Functions used in Syllabus¶

Author: N.J. de Winter (n.j.de.winter@vu.nl)
Assistant Professor, Vrije Universiteit Amsterdam
Statistics and Data Analysis Course

Learning goals:¶

  • Show how the figures in the syllabus are made
  • Get familiar with Jupyter and Python

Introduction¶

This notebook is not part of the assignments; it shows how some of the custom figures used in the course syllabus are produced. It also serves as a repository for reproducing these figures in future versions of the syllabus.

In [ ]:
# Import required packages

# Basic packages
import pandas as pd # The 'pandas' package helps us to import and manage data
import random # Python's built-in random module (most sampling below uses NumPy's np.random)
import numpy as np # To modify and work with datasets
import scipy.stats as stats # The 'scipy' package contains statistical formulas we will need

# Correlation and regression
import statsmodels.formula.api as smf # The 'statsmodels' package contains the functions needed to do regressions
import statsmodels.api as sm # The 'statsmodels' package contains the functions we need to run regression models
from scipy.stats import pearsonr # For calculating the Pearson's r

# Plotting
import matplotlib.pyplot as plt # For plotting
import matplotlib.patches as patches # To add shapes to plots

# Multivariate analysis
from scipy.stats import multivariate_normal # To generate multivariate normal distributions
from mpl_toolkits.mplot3d import Axes3D # To plot 3D graphs

# Clustering and classification
from scipy.cluster.hierarchy import dendrogram, linkage # to plot dendrograms
from scipy.spatial.distance import pdist, squareform # to perform hierarchical classification and data preparation

# Factor Analysis
from sklearn.decomposition import PCA # Needed for PCA and principal axis analysis

# Time Series Analysis
from scipy.io import loadmat # Load Matlab .mat files as data
from statsmodels.graphics.tsaplots import plot_acf # For plotting the autocorrelation function (ACF)
from scipy.signal import spectrogram # For spectral analysis

# Extreme value theory
from scipy.stats import gumbel_r # Function to plot the Gumbel distribution

# Spatial analysis
from scipy.spatial import cKDTree # Helps with nearest neighbour interpolation
from scipy.interpolate import griddata # To create spatial gridded data

# The line below allows us to visualize plots we make directly in the notebook
%matplotlib inline

Figure 3: Covariance and correlation examples¶

The code below creates variables x and y and colors datapoints based on how they contribute to the covariance between the variables.
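
Each point's contribution is its product of deviations from the means: (x_i - mean(x)) * (y_i - mean(y)). A minimal self-contained sketch of this decomposition (the x and y here are illustrative stand-ins, not the figure data):

In [ ]:
# Per-point covariance contributions (illustrative example, not the figure data)
import numpy as np

np.random.seed(0)
x = np.random.normal(size = 10)
y = 0.6 * x + np.random.normal(size = 10)

contrib = (x - x.mean()) * (y - y.mean()) # per-point contribution to the covariance
print(contrib > 0) # True = positive contribution (the red points in the figure)
print(contrib.mean(), np.cov(x, y, bias = True)[0, 1]) # mean of the contributions equals the population covariance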

In [ ]:
# Figure 3

# Define parameters
num_samples = 10
mean = [0, 0]
covariance_matrix = [[1, 0.6], [0.6, 1]]

# Generate bivariate dataset with Pearson's r correlation of 0.6
np.random.seed(0) # Set seed for reproducibility (NumPy's generator is used below)
x, y = np.random.multivariate_normal(mean, covariance_matrix, num_samples).T # draw x and y from a bivariate normal distribution

# Scale x values to range between -10 and 10
x = x * 10 / max(abs(x))

# Scale y values to range between 0 and 100
y = (y - min(y)) * 100 / (max(y) - min(y))

# Check the actual correlation coefficient
correlation_coefficient, p_value = pearsonr(x, y)

# Calculate mean values of x and y
mean_x = np.mean(x)
mean_y = np.mean(y)

# Create a mask for points above and below mean values
mask_above = (x >= mean_x) & (y >= mean_y)
mask_below = (x <= mean_x) & (y <= mean_y)

# Visualize the data
plt.scatter(x, y, alpha = 0.5, label = 'negative covariance contribution') # Plot all points (blue = negative contribution)
plt.scatter(x[mask_above], y[mask_above], color = 'red', alpha = 0.5, label = 'positive covariance contribution') # Points with both x and y above their means contribute positively (red)
plt.scatter(x[mask_below], y[mask_below], color = 'red', alpha = 0.5) # Points with both x and y below their means also contribute positively (red)
plt.axhline(mean_y, color = 'green', linestyle = '--', label = 'Mean Y')
plt.axvline(mean_x, color = 'green', linestyle = '--', label = 'Mean X')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Bivariate Dataset with Pearson\'s correlation coefficient = ' + str(round(correlation_coefficient, 2)))
plt.grid(True)
plt.show()

The code below does the same, but with y related to x by a second-order polynomial (quadratic function). The linear correlation is close to zero (-0.03 in the original run).
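
The near-zero Pearson's r arises because the positive and negative covariance contributions cancel for a symmetric quadratic relationship. A quick illustrative check (not the exact figure data):

In [ ]:
# Why the linear correlation vanishes for a symmetric quadratic relationship (illustrative)
import numpy as np

np.random.seed(5)
x = np.random.uniform(-10, 10, 100)
y = x ** 2 # purely quadratic, symmetric around x = 0

contrib = (x - x.mean()) * (y - y.mean()) # per-point covariance contributions
print(contrib[contrib > 0].sum(), contrib[contrib < 0].sum()) # similar magnitude, opposite sign
print(np.corrcoef(x, y)[0, 1]) # Pearson's r close to zero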

In [ ]:
# Define parameters
num_samples = 10
x_range = (-10, 10)

# Generate x values
np.random.seed(5) # Set seed for reproducibility (NumPy's generator is used below)
x = np.random.uniform(x_range[0], x_range[1], num_samples)

# Compute y values as a quadratic function of x with random coefficients
a, b, c = np.random.uniform(-2, 2, 3) # coefficients of the quadratic function
y = a * x**2 + b * x + c

# Check the actual correlation coefficient
correlation_coefficient2, p_value2 = pearsonr(x, y)

# Evaluate the quadratic function over the plotting range
x_line = np.linspace(-10, 10, 100)
smoothed_y = np.polyval((a, b, c), x_line)

# Calculate mean values of x and y
mean_x = np.mean(x)
mean_y = np.mean(y)

# Create a mask for points above and below mean values
mask_above = (x > mean_x) & (y > mean_y)
mask_below = (x < mean_x) & (y < mean_y)

# Plot the data with different colors
plt.scatter(x, y, alpha=0.5, label = 'negative covariance contribution') # Plot all points (blue = negative contribution)
plt.scatter(x[mask_above], y[mask_above], color='red', alpha=0.5, label = 'positive covariance contribution') # Points with both x and y above their means contribute positively (red)
plt.scatter(x[mask_below], y[mask_below], color='red', alpha=0.5) # Points with both x and y below their means also contribute positively (red)
plt.plot(x_line, smoothed_y, color='orange', label='Quadratic function') # Plot the quadratic function through the points
plt.axhline(mean_y, color='green', linestyle='--', label='Mean Y')
plt.axvline(mean_x, color='green', linestyle='--', label='Mean X')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Bivariate Dataset with Pearson\'s correlation coefficient = ' + str(round(correlation_coefficient2, 2)))
plt.grid(True)
plt.show()

Figure 5: Scatterplot of the elevation and soil organic matter weight % variables¶

In [ ]:
# Define variables
elev = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil = np.array([8.2, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 28.6, 29.1, 35])

# Create scatterplot
plt.scatter(elev, OMsoil, marker = '+', s = 100, color = "red") # Plot points
plt.xlabel('elevation (m)')
plt.ylabel('soil organic matter weight (%)')
plt.xlim(5, 20)
plt.ylim(5, 40)
Out[ ]:
(5.0, 40.0)

Figure 6: A regression for describing the relation between soil organic matter and elevation¶

In [ ]:
# Define variables
elev = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil = np.array([8.2, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 28.6, 29.1, 35])

# Perform linear regression
slope, intercept = np.polyfit(elev, OMsoil, 1)

print(slope)
print(intercept)

# Create scatterplot
plt.scatter(elev, OMsoil, marker = '+', s = 100, color = "red") # Plot points
plt.plot(np.array([5, 20]), slope * np.array([5, 20]) + intercept, color='black', label='Linear Regression')
plt.xlabel('elevation (m)')
plt.ylabel('soil organic matter weight (%)')
plt.xlim(5, 20)
plt.ylim(5, 40)

# Print the formula for the regression line
plt.text(10, 30, f'Regression Line: y = {slope:.2f}x + {intercept:.2f}', fontsize=12, color='black')

# Add annotation with arrow
x_value = 12.5
y_value = slope * x_value + intercept
plt.annotate('', xy=(x_value, y_value), xytext=(x_value, 10.1), arrowprops=dict(arrowstyle='<->', color='blue'))
-3.235047436610967
54.501975123283394
Out[ ]:
Text(12.5, 10.1, '')

Figure 7: A near-horizontal regression line with large deviations may indicate that a relationship between the dependent and independent variable is absent.¶

In [ ]:
# Define variables
elev = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil = np.array([7.8, 15.1, 9.8, 7.6, 14.8, 17.7, 10.4, 5.5, 15, 7.9])

# Perform linear regression
slope, intercept = np.polyfit(elev, OMsoil, 1)

print(slope)
print(intercept)

# Create scatterplot
plt.scatter(elev, OMsoil, marker = '+', s = 100, color = "red") # Plot points
plt.plot(np.array([5, 20]), slope * np.array([5, 20]) + intercept, color='blue', label='Linear Regression')
plt.xlabel('elevation (m)')
plt.ylabel('soil organic matter weight (%)')
plt.xlim(6, 18)
plt.ylim(0, 25)
0.06701958297200489
10.418093216499908
Out[ ]:
(0.0, 25.0)

Figure 8: Plots of the total variance, the variance of the regression, and the variance of the residuals for a regression.¶
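
The panels visualize the identity behind R²: the total sum of squares splits exactly into a regression part and a residual part. A short check with the Figure 6 data (used in panels D-F below):

In [ ]:
# Variance decomposition check: SS_total = SS_regression + SS_residual
import numpy as np

elev = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil = np.array([8.2, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 28.6, 29.1, 35])

slope, intercept = np.polyfit(elev, OMsoil, 1)
predicted = slope * elev + intercept

ss_total = np.sum((OMsoil - OMsoil.mean()) ** 2) # total variation (panel D)
ss_resid = np.sum((OMsoil - predicted) ** 2) # residual variation (panel E)
ss_reg = np.sum((predicted - OMsoil.mean()) ** 2) # regression variation (panel F)

print(ss_total, ss_reg + ss_resid) # the decomposition holds exactly for OLS
print(ss_reg / ss_total) # R squared (= Pearson's r squared for simple linear regression)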

In [ ]:
# Variables and regression
# Dataset 1
elev = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil = np.array([7.8, 15.1, 9.8, 7.6, 14.8, 17.7, 10.4, 5.5, 15, 7.9])
slope, intercept = np.polyfit(elev, OMsoil, 1)
mean_omsoil = np.mean(OMsoil)

# Dataset 2
elev_2 = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil_2 = np.array([8.2, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 28.6, 29.1, 35])
slope_2, intercept_2 = np.polyfit(elev_2, OMsoil_2, 1)
mean_omsoil_2 = np.mean(OMsoil_2)

# Create subplots
fig, axs = plt.subplots(2, 3, figsize=(15, 12))

# Plot A
axs[0, 0].scatter(elev, OMsoil, marker = '+', s = 100, color = "red")
axs[0, 0].plot(elev, slope * elev + intercept, color='blue', label='Linear Regression')
axs[0, 0].axhline(y=mean_omsoil, color='red', linestyle='--', label='Mean OM Soil')
for i in range(len(elev)):
    axs[0, 0].plot([elev[i], elev[i]], [OMsoil[i], mean_omsoil], color='red', linestyle='-', linewidth=0.5)
axs[0, 0].set_title('A. Total variance')
axs[0, 0].set_xlabel('elevation (m)')
axs[0, 0].set_ylabel('soil organic matter weight (%)')
axs[0, 0].set_xlim(6, 18)
axs[0, 0].set_ylim(0, 25)

# Plot B
axs[0, 1].scatter(elev, OMsoil, marker = '+', s = 100, color = "red")
axs[0, 1].plot(elev, slope * elev + intercept, color='blue', label='Linear Regression')
for i in range(len(elev)):
    axs[0, 1].plot([elev[i], elev[i]], [OMsoil[i], slope * elev[i] + intercept], color='blue', linestyle='-', linewidth=0.5)
axs[0, 1].set_title('B. Variance of the residuals')
axs[0, 1].set_xlabel('elevation (m)')
axs[0, 1].set_ylabel('soil organic matter weight (%)')
axs[0, 1].set_xlim(6, 18)
axs[0, 1].set_ylim(0, 25)

# Plot C
axs[0, 2].scatter(elev, OMsoil, marker = '+', s = 100, color = "red")
axs[0, 2].plot(elev, slope * elev + intercept, color='blue', label='Linear Regression')
for i in range(len(elev)):
    axs[0, 2].plot([elev[i], elev[i]], [mean_omsoil, slope * elev[i] + intercept], color='green', linestyle='-', linewidth=0.5)
axs[0, 2].plot(elev, OMsoil, '+', color='blue')
axs[0, 2].axhline(y=mean_omsoil, color='red', linestyle='--', label='Mean OM Soil')
axs[0, 2].set_title('C. Variance of the regression')
axs[0, 2].set_xlabel('elevation (m)')
axs[0, 2].set_ylabel('soil organic matter weight (%)')
axs[0, 2].set_xlim(6, 18)
axs[0, 2].set_ylim(0, 25)

# Plot D
axs[1, 0].scatter(elev_2, OMsoil_2, marker = '+', s = 100, color = "red")
axs[1, 0].plot(elev_2, slope_2 * elev_2 + intercept_2, color='blue', label='Linear Regression')
axs[1, 0].axhline(y=mean_omsoil_2, color='red', linestyle='--', label='Mean OM Soil')
for i in range(len(elev_2)):
    axs[1, 0].plot([elev_2[i], elev_2[i]], [OMsoil_2[i], mean_omsoil_2], color='red', linestyle='-', linewidth=0.5)
axs[1, 0].set_title('D. Total variance')
axs[1, 0].set_xlabel('elevation (m)')
axs[1, 0].set_ylabel('soil organic matter weight (%)')
axs[1, 0].set_xlim(6, 18)
axs[1, 0].set_ylim(5, 40)

# Plot E
axs[1, 1].scatter(elev_2, OMsoil_2, marker = '+', s = 100, color = "red")
axs[1, 1].plot(elev_2, slope_2 * elev_2 + intercept_2, color='blue', label='Linear Regression')
for i in range(len(elev_2)):
    axs[1, 1].plot([elev_2[i], elev_2[i]], [OMsoil_2[i], slope_2 * elev_2[i] + intercept_2], color='blue', linestyle='-', linewidth=0.5)
axs[1, 1].set_title('E. Variance of the residuals')
axs[1, 1].set_xlabel('elevation (m)')
axs[1, 1].set_ylabel('soil organic matter weight (%)')
axs[1, 1].set_xlim(6, 18)
axs[1, 1].set_ylim(5, 40)

# Plot F
axs[1, 2].scatter(elev_2, OMsoil_2, marker = '+', s = 100, color = "red")
axs[1, 2].plot(elev_2, slope_2 * elev_2 + intercept_2, color='blue', label='Linear Regression')
for i in range(len(elev_2)):
    axs[1, 2].plot([elev_2[i], elev_2[i]], [mean_omsoil_2, slope_2 * elev_2[i] + intercept_2], color='green', linestyle='-', linewidth=0.5)
axs[1, 2].plot(elev_2, OMsoil_2, '+', color='blue')
axs[1, 2].axhline(y=mean_omsoil_2, color='red', linestyle='--', label='Mean OM Soil')
axs[1, 2].set_title('F. Variance of the regression')
axs[1, 2].set_xlabel('elevation (m)')
axs[1, 2].set_ylabel('soil organic matter weight (%)')
axs[1, 2].set_xlim(6, 18)
axs[1, 2].set_ylim(5, 40)

# Adjust layout
plt.tight_layout()

Figure 9: Covariance and correlation example with polynomial curve¶

In [ ]:
# Define parameters
num_samples = 10
x_range = (-10, 10)

# Generate x values
np.random.seed(5) # Set seed for reproducibility (NumPy's generator is used below)
x = np.random.uniform(x_range[0], x_range[1], num_samples)

# Compute y values as a quadratic function of x with random coefficients
a, b, c = np.random.uniform(-2, 2, 3) # coefficients of the quadratic function
y = a * x**2 + b * x + c

# Check the actual correlation coefficient
correlation_coefficient2, p_value2 = pearsonr(x, y)

# Fit a linear regression to the data
slope, intercept = np.polyfit(x, y, 1)

# Evaluate the quadratic function over the plotting range
x_line = np.linspace(-10, 10, 100)
smoothed_y = np.polyval((a, b, c), x_line)

# Calculate mean values of x and y
mean_x = np.mean(x)
mean_y = np.mean(y)

# Plot the data
plt.scatter(x, y, label = 'Data points') # Plot points
plt.plot(x_line, slope * x_line + intercept, color='blue', label='Linear Regression') # Plot the linear regression line
plt.plot(x_line, smoothed_y, color='orange', label='Quadratic function') # Plot the quadratic function through the points
plt.axhline(mean_y, color='green', linestyle='--', label='Mean Y')
plt.axvline(mean_x, color='green', linestyle='--', label='Mean X')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Bivariate Dataset with Pearson\'s correlation coefficient = ' + str(round(correlation_coefficient2, 2)))
plt.grid(True)
plt.show()

Figure 10: Linear regression through a dataset before and after outlier detection¶
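
In the cell below the outliers are removed by hand. One common automated alternative (not the procedure used here) is to flag points with large standardized residuals after a first-pass fit; the 1.5-standard-deviation threshold in this sketch is an arbitrary choice:

In [ ]:
# Residual-based outlier flagging (a sketch; the figure below removes outliers manually)
import numpy as np

elev = np.array([15, 14.8, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.8, 8.4, 7.5, 7.1, 6.5])
OMsoil = np.array([8.2, 21.1, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 14.5, 28.6, 29.1, 35, 15])

slope, intercept = np.polyfit(elev, OMsoil, 1)
residuals = OMsoil - (slope * elev + intercept)

z = residuals / residuals.std() # standardized residuals
print(np.where(np.abs(z) > 1.5)[0]) # indices of candidate outliers (threshold is a judgment call)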

In [ ]:
# Define variables
elev = np.array([15, 14.8, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.8, 8.4, 7.5, 7.1, 6.5])
OMsoil = np.array([8.2, 21.1, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 14.5, 28.6, 29.1, 35, 15])

# Perform linear regression
slope, intercept = np.polyfit(elev, OMsoil, 1)

# Check the correlation coefficient with outliers
correlation_coefficient, p_value = pearsonr(elev, OMsoil)

# Remove outliers
elev_clean = np.array([15, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.4, 7.5, 7.1])
OMsoil_clean = np.array([8.2, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 28.6, 29.1, 35])

# Perform linear regression
slope_clean, intercept_clean = np.polyfit(elev_clean, OMsoil_clean, 1)

# Check the correlation coefficient with outliers
correlation_coefficient_clean, p_value_clean = pearsonr(elev_clean, OMsoil_clean)

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(15, 8))

# Plot A
axs[0].scatter(elev, OMsoil, marker = '+', s = 100, color = "red") # Plot points
axs[0].scatter(elev_clean, OMsoil_clean, marker = '+', s = 100, color = "blue") # Plot non-outliers
axs[0].plot(np.array([5, 20]), slope * np.array([5, 20]) + intercept, color='black', label='Linear Regression')
axs[0].set_title('A. With outliers')
axs[0].set_xlabel('elevation (m)')
axs[0].set_ylabel('soil organic matter weight (%)')
axs[0].set_xlim(5, 20)
axs[0].set_ylim(5, 40)

# Print the formula for the regression line
axs[0].text(10, 30, f"Regression Line with outliers:\ny = {slope:.2f}x + {intercept:.2f}\nPearson's r = {correlation_coefficient:.2f}", fontsize=12, color='black')

# Plot B
axs[1].scatter(elev_clean, OMsoil_clean, marker = '+', s = 100, color = "blue") # Plot non-outliers
axs[1].plot(np.array([5, 20]), slope_clean * np.array([5, 20]) + intercept_clean, color='black', label='Linear Regression')
axs[1].set_title('B. Without outliers')
axs[1].set_xlabel('elevation (m)')
axs[1].set_ylabel('soil organic matter weight (%)')
axs[1].set_xlim(5, 20)
axs[1].set_ylim(5, 40)

# Print the formula for the regression line
axs[1].text(10, 30, f"Regression Line without outliers:\ny = {slope_clean:.2f}x + {intercept_clean:.2f}\nPearson's r = {correlation_coefficient_clean:.2f}", fontsize=12, color='black')
Out[ ]:
Text(10, 30, "Regression Line without outliers:\ny = -3.24x + 54.50\nPearson's r = -0.96")

Figure 12: Transformed exponential relationship example A¶

In [ ]:
# load CO2 vs temperature data
df = pd.read_csv('CO2_temperature.csv') # Load the CO2 vs temperature dataset

# Create linear regression between pCO2 and Year in dataset df
linregression = smf.ols(formula = "df.pCO2 ~ df.Year", data = df).fit()

# Print the regression coefficients of the new linear regression
print('The regression coefficients are:\n', linregression.params)

# Print the regression summary to check the strength and the significance of the linear model
print(linregression.summary())

# Plot pCO2 vs time (Plot A)
plt.scatter(df.Year, df.pCO2, color = 'green', marker = '+')
plt.xlabel('Year')
plt.ylabel('pCO2 (in ppmV)')
plt.plot(df.Year, linregression.params.iloc[0] + linregression.params.iloc[1] * df.Year, color = 'blue')
plt.title('Atmospheric CO2 concentration over time')
The regression coefficients are:
 Intercept   -1008.320818
df.Year         0.685790
dtype: float64
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                df.pCO2   R-squared:                       0.824
Model:                            OLS   Adj. R-squared:                  0.823
Method:                 Least Squares   F-statistic:                     613.5
Date:                Thu, 01 Aug 2024   Prob (F-statistic):           2.87e-51
Time:                        08:19:57   Log-Likelihood:                -551.53
No. Observations:                 133   AIC:                             1107.
Df Residuals:                     131   BIC:                             1113.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -1008.3208     53.968    -18.684      0.000   -1115.083    -901.559
df.Year        0.6858      0.028     24.769      0.000       0.631       0.741
==============================================================================
Omnibus:                       10.556   Durbin-Watson:                   0.014
Prob(Omnibus):                  0.005   Jarque-Bera (JB):               11.071
Skew:                           0.672   Prob(JB):                      0.00394
Kurtosis:                       2.561   Cond. No.                     7.87e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.87e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Out[ ]:
Text(0.5, 1.0, 'Atmospheric CO2 concentration over time')

Figure 13: Transformed exponential relationship example B¶

In [ ]:
# load CO2 vs temperature data
df = pd.read_csv('CO2_temperature.csv') # Load the CO2 vs temperature dataset

# Create relative pCO2 and Year variables
df['pCO2rel'] = df.pCO2 - 280
df['Yearrel'] = df.Year - 1850

df['logCO2_rel'] = np.log(df.pCO2rel) # Create a new variable that is the natural logarithm of the modified pCO2 variable

expreg = smf.ols(formula = "logCO2_rel ~ Yearrel", data = df).fit() # Perform the 'exponential' regression in its linearized form

print(expreg.summary()) # Print the regression summary

c2 = np.exp(expreg.params.iloc[0]) # Calculate the parameter c by taking the natural exponent

# Plot log(pCO2) vs time (Plot B)
plt.scatter(df.Yearrel, df.logCO2_rel, color = 'green', marker = '+')
plt.xlabel('Year (relative to 1850)')
plt.ylabel('ln(pCO2 - 280) (ppmV)')
plt.plot(df.Yearrel, expreg.params.iloc[0] + expreg.params.iloc[1] * df.Yearrel, color = 'purple')
plt.title('Natural logarithm of atmospheric CO2 concentration over time')
                            OLS Regression Results                            
==============================================================================
Dep. Variable:             logCO2_rel   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                     5689.
Date:                Thu, 01 Aug 2024   Prob (F-statistic):          8.41e-110
Time:                        08:20:00   Log-Likelihood:                 77.814
No. Observations:                 133   AIC:                            -151.6
Df Residuals:                     131   BIC:                            -145.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.7155      0.027     64.059      0.000       1.662       1.768
Yearrel        0.0184      0.000     75.427      0.000       0.018       0.019
==============================================================================
Omnibus:                       41.839   Durbin-Watson:                   1.378
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              151.089
Skew:                          -1.081   Prob(JB):                     1.55e-33
Kurtosis:                       7.753   Cond. No.                         250.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[ ]:
Text(0.5, 1.0, 'Natural logarithm of atmospheric CO2 concentration over time')

Figure 14: Transformed exponential relationship example C¶

In [ ]:
# Plot the exponential model on top of the pCO2 data (Plot C)
plt.scatter(df.Year, df.pCO2, color = 'green', marker = '+')
plt.xlabel('Year')
plt.ylabel('pCO2 (ppmV)')
plt.plot(df.Year, np.exp(expreg.params.iloc[0]) * np.exp(expreg.params.iloc[1] * df.Yearrel) + 280, color = 'purple')
plt.title('Atmospheric CO2 concentration over time')
Out[ ]:
Text(0.5, 1.0, 'Atmospheric CO2 concentration over time')

Figure 17: Simple linear regression result depends on which of the variables is considered the independent variable.¶
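
The two fitted lines differ because OLS minimizes vertical offsets in whichever variable is treated as dependent. The two slopes are tied together by the correlation: their product equals r². A quick check with the same data:

In [ ]:
# The product of the two regression slopes equals r squared
import numpy as np
from scipy.stats import pearsonr

grainsize = np.array([15, 14.8, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.8, 8.4, 7.5, 7.1, 6.5])
sedrate = np.array([8.2, 21.1, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 14.5, 28.6, 29.1, 35, 15])

slope1 = np.polyfit(grainsize, sedrate, 1)[0] # sedimentation rate ~ grain size
slope2 = np.polyfit(sedrate, grainsize, 1)[0] # grain size ~ sedimentation rate

r = pearsonr(grainsize, sedrate)[0]
print(slope1 * slope2, r ** 2) # these two numbers agree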

In [ ]:
# Define variables (same values as the Figure 10 dataset, relabeled as grain size and sedimentation rate)
grainsize = np.array([15, 14.8, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.8, 8.4, 7.5, 7.1, 6.5])
sedrate = np.array([8.2, 21.1, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 14.5, 28.6, 29.1, 35, 15])

# Perform linear regression in two directions
slope1, intercept1 = np.polyfit(grainsize, sedrate, 1)
slope2, intercept2 = np.polyfit(sedrate, grainsize, 1)

# Create scatterplot
plt.scatter(grainsize, sedrate, marker = '+', s = 100, color = "red") # Plot points
plt.plot(np.array([5, 20]), slope1 * np.array([5, 20]) + intercept1, color='black', label='Linear Regression 1: sedimentation rate ~ grain size')
plt.plot(np.array([5, 20]), (np.array([5, 20]) - intercept2) / slope2, color='red', label='Linear Regression 2: grain size ~ sedimentation rate') # Invert GS = slope2 * SR + intercept2 to plot it in SR-vs-GS space
plt.xlabel('grainsize (GS; mm)')
plt.ylabel('sedimentation rate (SR; cm/kyr)')
plt.xlim(5, 20)
plt.ylim(5, 40)

# Print the formula for the regression line
plt.text(10, 30, f'Regression 1: SR = {slope1:.2f} * GS + {intercept1:.2f}', fontsize=12, color='black')
plt.text(10, 25, f'Regression 2: GS = {slope2:.2f} * SR + {intercept2:.2f}', fontsize=12, color='red')
Out[ ]:
Text(10, 25, 'Regression 2: GS = -0.24 * SR + 15.16')

Figure 19: Simple linear regression results with two different choices of independent variable, the principal axis, and the reduced major axis.¶

In [ ]:
# Define variables (same values as the Figure 10 dataset, relabeled as grain size and sedimentation rate)
grainsize = np.array([15, 14.8, 14.5, 13.8, 12.5, 12.3, 10.1, 9.5, 8.8, 8.4, 7.5, 7.1, 6.5])
sedrate = np.array([8.2, 21.1, 8.3, 8.9, 10.1, 18.3, 17.9, 22.5, 14.5, 28.6, 29.1, 35, 15])

# Perform linear regression in two directions
slope1, intercept1 = np.polyfit(grainsize, sedrate, 1)
slope2, intercept2 = np.polyfit(sedrate, grainsize, 1)

# Find principal axis
X = np.column_stack((grainsize, sedrate)) # combine variables in array
pca = PCA(n_components = 2) # Initiate PCA for two variables
pca.fit(X) # Use data in X to find principal axes
slope3 = pca.components_[0, 1] / pca.components_[0, 0] # Find slope of the principal axis
intercept3 = np.mean(sedrate) - slope3 * np.mean(grainsize) # Find intercept of the principal axis

# Find reduced major axis
slope4 = np.sign(pearsonr(grainsize, sedrate)[0]) * np.std(sedrate) / np.std(grainsize) # Find slope of the reduced major axis
intercept4 = np.mean(sedrate) - slope4 * np.mean(grainsize) # Find intercept of the reduced major axis

# Create scatterplot
plt.scatter(grainsize, sedrate, marker = '+', s = 100, color = "red") # Plot points
plt.plot(np.array([5, 20]), slope1 * np.array([5, 20]) + intercept1, color='black', label='Linear Regression 1: sedimentation rate ~ grain size')
plt.plot(np.array([5, 20]), (np.array([5, 20]) - intercept2) / slope2, color='black', label='Linear Regression 2: grain size ~ sedimentation rate') # Inverted to plot in SR-vs-GS space
plt.plot(np.array([5, 20]), slope3 * np.array([5, 20]) + intercept3, color='red', label='Principal axis')
plt.plot(np.array([5, 20]), slope4 * np.array([5, 20]) + intercept4, color='blue', label='Reduced major axis')
plt.xlabel('grainsize (GS; mm)')
plt.ylabel('sedimentation rate (SR; cm/kyr)')
plt.xlim(5, 20)
plt.ylim(5, 40)

# Print the formula for the regression line
plt.text(10, 30, f'Regression 1: SR = {slope1:.2f} * GS + {intercept1:.2f}', fontsize=12, color='black')
plt.text(10, 27.5, f'Principal Axis: SR = {slope3:.2f} * GS + {intercept3:.2f}', fontsize=12, color='red')
plt.text(10, 25, f'Reduced Major Axis: SR = {slope4:.2f} * GS + {intercept4:.2f}', fontsize=12, color='blue')
plt.text(10, 22.5, f'Regression 2: GS = {slope2:.2f} * SR + {intercept2:.2f}', fontsize=12, color='black')
Out[ ]:
Text(10, 22.5, 'Regression 2: GS = -0.24 * SR + 15.16')

Figure 21: Linear regression with a binary independent variable¶
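
With a single 0/1 dummy as predictor, the regression coefficients have a simple interpretation: the intercept is the mean of the 0-group and the slope is the difference between the two group means. A check with the same data as below:

In [ ]:
# Dummy-variable regression coefficients equal group means and their difference
import numpy as np

nationality = np.array(["Dutch"] * 6 + ["non-Dutch"] * 7)
height = np.array([175, 165, 169, 189, 201, 195, 159, 173, 179, 166, 155, 168, 170])
dummy = np.where(nationality == "Dutch", 0, 1)

slope, intercept = np.polyfit(dummy, height, 1)

print(intercept, height[dummy == 0].mean()) # intercept = mean height of the Dutch group
print(slope, height[dummy == 1].mean() - height[dummy == 0].mean()) # slope = difference in group means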

In [ ]:
# Define variables
nationality = np.array(["Dutch", "Dutch", "Dutch", "Dutch", "Dutch", "Dutch", "non-Dutch", "non-Dutch", "non-Dutch", "non-Dutch", "non-Dutch", "non-Dutch", "non-Dutch"])
height = np.array([175, 165, 169, 189, 201, 195, 159, 173, 179, 166, 155, 168, 170])

# Create dummy variable
dummy = np.where(nationality == "Dutch", 0, 1)

# Perform linear regression with dummy variable
slope, intercept = np.polyfit(dummy, height, 1)

# Check the correlation coefficient and significance of the regression
correlation_coefficient, p_value = pearsonr(dummy, height)

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(10, 4))

# Plot A
axs[0].scatter(nationality, height, marker = '+', s = 100, color = "red") # Plot points
axs[0].set_title('A. Plot with binary variable')
axs[0].set_xlabel('Nationality')
axs[0].set_ylabel('height (cm)')
axs[0].set_ylim(150, 210)

# Plot B
axs[1].scatter(dummy, height, marker = '+', s = 100, color = "red") # Plot points
axs[1].plot(np.array([-0.1, 1.1]), slope * np.array([-0.1, 1.1]) + intercept, color='black') # Plot linear regression result
axs[1].set_title('B. Plot with dummy variable and linear regression')
axs[1].set_xlabel('Dummy (0 = Dutch, 1 = non-Dutch)')
axs[1].set_ylabel('height (cm)')
axs[1].set_ylim(150, 210)
axs[1].set_xlim(-0.1, 1.1)

# Print the formula for the regression line
plt.text(0.1, 190, f"Height = {slope:.2f} * dummy + {intercept:.1f}\nPearson's r = {correlation_coefficient:.2f}\np-value = {p_value:.2f}", fontsize=12, color='black')
Out[ ]:
Text(0.1, 190, "Height = -15.19 * dummy + 182.3\nPearson's r = -0.58\np-value = 0.04")

Figure 22: Linear regression with a binary dependent variable (does not work)¶

In [ ]:
# Define variables
years_smoking1 = np.array([12, 26, 42, 24, 80, 3, 12, 64, 21, 14, 8, 40, 8])
developed_lung_cancer1 = np.array(["yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", "no", "no", "no", "no"])
years_smoking2 = np.array([29, 39, 42, 49, 40, 3, 12, 24, 21, 14, 8, 30, 8])
developed_lung_cancer2 = np.array(["yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", "no", "no", "no", "no"])

# Create dummy1 variable
dummy1 = np.where(developed_lung_cancer1 == "no", 0, 1)
dummy2 = np.where(developed_lung_cancer2 == "no", 0, 1)

# Perform linear regression with dummy1 variable
slope1, intercept1 = np.polyfit(years_smoking1, dummy1, 1)
slope2, intercept2 = np.polyfit(years_smoking2, dummy2, 1)

# Check the correlation coefficient and significance of the regression
correlation_coefficient1, p_value1 = pearsonr(years_smoking1, dummy1)
correlation_coefficient2, p_value2 = pearsonr(years_smoking2, dummy2)

# Create subplots
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# Plot A
axs[0].scatter(years_smoking1, developed_lung_cancer1, marker = '+', s = 100, color = "red") # Plot points
axs[0].set_title('A. Plot with binary dependent variable')
axs[0].set_xlabel('Years of smoking')
axs[0].set_ylabel('Developed lung cancer')
axs[0].invert_yaxis()
axs[0].set_xlim(0, 100)

# Plot B
axs[1].scatter(years_smoking1, dummy1, marker = '+', s = 100, color = "red") # Plot points
axs[1].plot(np.array([0, 100]), slope1 * np.array([0, 100]) + intercept1, color='black') # Plot linear regression result
axs[1].set_title('B. Plot with dummy variable as\ndependent variable and linear regression')
axs[1].set_xlabel('Years of smoking')
axs[1].set_ylabel('Dummy1 (0 = no, 1 = yes)')
axs[1].set_ylim(-0.1, 1.1)
axs[1].set_xlim(0, 100)
axs[1].text(5, 0.7, f"Lung cancer? = {slope1:.2f} * years {intercept1:+.2f}\nPearson's r = {correlation_coefficient1:.2f}\np-value = {p_value1:.2f}", fontsize=12, color='black')

# Plot C
axs[2].scatter(years_smoking2, dummy2, marker = '+', s = 100, color = "red") # Plot points
axs[2].plot(np.array([0, 100]), slope2 * np.array([0, 100]) + intercept2, color='black') # Plot linear regression result
axs[2].set_title('C. Plot with dummy variable as\ndependent variable and linear regression')
axs[2].set_xlabel('Years of smoking')
axs[2].set_ylabel('Dummy (0 = no, 1 = yes)')
axs[2].set_ylim(-0.1, 1.1)
axs[2].set_xlim(0, 100)
axs[2].text(5, 0.7, f"Lung cancer? = {slope2:.2f} * years {intercept2:+.2f}\nPearson's r = {correlation_coefficient2:.2f}\np-value = {p_value2:.2f}", fontsize=12, color='black')
Out[ ]:
Text(5, 0.7, "Lung cancer? = 0.03 * years -0.31\nPearson's r = 0.84\np-value = 0.00")

Figure 23: The basic sigmoid function¶
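
A convenient property of the sigmoid 1 / (1 + e^(-x)) is that its derivative equals sigmoid(x) * (1 - sigmoid(x)), which is exactly the logistic density compared with the normal distribution in Figure 25. A numerical check:

In [ ]:
# The derivative of the sigmoid is the logistic density of Figure 25
import numpy as np

x = np.arange(-10, 10, 0.1)
sigmoid = 1 / (1 + np.exp(-x))

derivative = np.gradient(sigmoid, x) # numerical derivative of the sigmoid
logistic_pdf = sigmoid * (1 - sigmoid) # analytical form: sigma(x) * (1 - sigma(x))

print(np.allclose(derivative, logistic_pdf, atol = 1e-3)) # True: the curves coincide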

In [ ]:
# Define variables
x = np.arange(-10, 10, 0.1)
y = 1 / (1 + np.exp(-1 * x))
y_neg = 1 - y

# Plot the sigmoid curve and its mirror image
plt.plot(x, y, color = "red") # Sigmoid curve
plt.plot(x, y_neg, color = "blue") # Mirrored sigmoid: 1 - sigmoid(x)
plt.xlabel('X')
plt.ylabel('Y')
Out[ ]:
Text(0, 0.5, 'Y')

Figure 24: Logistic regression with a binary dependent variable¶
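
Once the logistic model is fitted, it can be inverted: the predicted probability crosses 0.5 where b0 + b1 * x = 0, i.e. at x = -b0/b1. A standalone sketch with the first example's data:

In [ ]:
# Decision boundary of a fitted logistic regression: p = 0.5 at x = -b0 / b1
import numpy as np
import statsmodels.api as sm

years_smoking = np.array([12, 26, 42, 24, 80, 3, 12, 64, 21, 14, 8, 40, 8])
dummy = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) # 1 = developed lung cancer

result = sm.Logit(dummy, sm.add_constant(years_smoking)).fit(disp = 0)
b0, b1 = result.params

print(-b0 / b1) # years of smoking at which the predicted probability is 0.5
print(result.predict(np.array([[1, -b0 / b1]]))) # check: returns (approximately) 0.5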

In [ ]:
# Define variables
years_smoking1 = np.array([12, 26, 42, 24, 80, 3, 12, 64, 21, 14, 8, 40, 8])
developed_lung_cancer1 = np.array(["yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", "no", "no", "no", "no"])
years_smoking2 = np.array([29, 39, 42, 49, 40, 3, 12, 24, 21, 14, 8, 30, 8])
developed_lung_cancer2 = np.array(["yes", "yes", "yes", "yes", "yes", "no", "no", "no", "no", "no", "no", "no", "no"])

# Create dummy1 variable
dummy1 = np.where(developed_lung_cancer1 == "no", 0, 1)
dummy2 = np.where(developed_lung_cancer2 == "no", 0, 1)

# Perform logistic regression
# Example 1
years_smoking_with_constant1 = sm.add_constant(years_smoking1) # Add constant to the independent variable
logit_model1 = sm.Logit(dummy1, years_smoking_with_constant1) # Perform regression
logit_result1 = logit_model1.fit()
print(logit_result1.summary())

# Example 2
years_smoking_with_constant2 = sm.add_constant(years_smoking2) # Add constant to the independent variable
logit_model2 = sm.Logit(dummy2, years_smoking_with_constant2) # Perform regression
logit_result2 = logit_model2.fit()

# Generate x values for plotting model outcome
X_test = np.linspace(0, 100, 200)
X_test_with_constant = sm.add_constant(X_test)

# Predict the model outcomes for plotting
y1 = logit_result1.predict(X_test_with_constant)
y2 = logit_result2.predict(X_test_with_constant)

# Create subplots
fig, axs = plt.subplots(1, 2, figsize = (10, 4))

# Plot A
axs[0].scatter(years_smoking1, dummy1, marker = '+', s = 100, color = "red") # Plot points
axs[0].plot(X_test, y1, color='black') # Plot linear regression result
axs[0].set_title('A. Plot with dummy variable as\ndependent variable and logistic regression')
axs[0].set_xlabel('Years of smoking')
axs[0].set_ylabel('Dummy1 (0 = no, 1 = yes)')
axs[0].set_ylim(-0.1, 1.1)
axs[0].set_xlim(0, 100)
axs[0].text(5, 0.7, r'$p({{lung cancer}}) = \frac{1}{1 + e^{-(%.2f + %.2f \cdot {{years}})}}$' % (logit_result1.params[0], logit_result1.params[1]))

# Plot B
axs[1].scatter(years_smoking2, dummy2, marker = '+', s = 100, color = "red") # Plot points
axs[1].plot(X_test, y2, color='black') # Plot linear regression result
axs[1].set_title('B. Plot with dummy variable as\ndependent variable and logistic regression')
axs[1].set_xlabel('Years of smoking')
axs[1].set_ylabel('Dummy (0 = no, 1 = yes)')
axs[1].set_ylim(-0.1, 1.1)
axs[1].set_xlim(0, 100)
axs[1].text(35, 0.7, r'$p({{lung cancer}}) = \frac{1}{1 + e^{-(%.2f + %.2f \cdot {{years}})}}$' % (logit_result2.params[0], logit_result2.params[1]))
Optimization terminated successfully.
         Current function value: 0.609180
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                   13
Model:                          Logit   Df Residuals:                       11
Method:                           MLE   Df Model:                            1
Date:                Thu, 01 Aug 2024   Pseudo R-squ.:                 0.08570
Time:                        08:20:30   Log-Likelihood:                -7.9193
converged:                       True   LL-Null:                       -8.6616
Covariance Type:            nonrobust   LLR p-value:                    0.2231
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.3714      0.992     -1.382      0.167      -3.316       0.573
x1             0.0321      0.028      1.143      0.253      -0.023       0.087
==============================================================================
Optimization terminated successfully.
         Current function value: 0.133948
         Iterations 10
Out[ ]:
Text(35, 0.7, '$p({{lung cancer}}) = \\frac{1}{1 + e^{-(-15.08 + 0.51 \\cdot {{years}})}}$')

Figure 25: Logistic distribution compared with normal distribution¶

In [ ]:
# Define variables
x = np.arange(-5, 5, 0.1)
y_log = np.exp(-1 * x) / ((1 + np.exp(-1 * x)) ** 2)
y_nor = 1 / (np.sqrt(2 * np.pi)) * np.exp(-0.5 * x ** 2)

# Plot the logistic and standard normal density curves
plt.plot(x, y_log, color = "red") # Logistic density
plt.plot(x, y_nor, color = "blue") # Standard normal density
plt.xlabel('X')
plt.ylabel('Y')
Out[ ]:
Text(0, 0.5, 'Y')

Table 10: Correlation matrices of a regular and a closed random dataset¶
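
Closing the data (dividing each row by its row sum) forces the variables to compete for a constant total, which induces spurious negative correlations even when the raw variables are independent, as the matrices below show. A quick sanity check on an equivalent closed dataset:

In [ ]:
# Sanity check on closure: rows sum to 1 and off-diagonal correlations turn negative
import numpy as np
import pandas as pd

np.random.seed(0)
df = pd.DataFrame(np.random.rand(100, 4), columns = list('ABCD'))
dfclosed = df.div(df.sum(axis = 1), axis = 0)

print(dfclosed.sum(axis = 1).round(10).unique()) # every row sums to 1
print(dfclosed.corr().values[np.triu_indices(4, k = 1)]) # pairwise correlations, typically all negative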

In [ ]:
# Generate some random data: 4 variables with 100 observations
np.random.seed(0) # Set seed
df = pd.DataFrame({'A' : np.random.rand(100),
                  'B' : np.random.rand(100),
                  'C' : np.random.rand(100),
                  'D' : np.random.rand(100)}
)

# Calculate the closed-sum equivalent of the data 
dfclosed = df.div(df.sum(axis = 1), axis = 0)

# Calculate and plot correlation matrices
print(df.corr())
print(dfclosed.corr())
          A         B         C         D
A  1.000000 -0.066107 -0.036512  0.021895
B -0.066107  1.000000 -0.124047 -0.061987
C -0.036512 -0.124047  1.000000 -0.115117
D  0.021895 -0.061987 -0.115117  1.000000
          A         B         C         D
A  1.000000 -0.302010 -0.368956 -0.247604
B -0.302010  1.000000 -0.372964 -0.355542
C -0.368956 -0.372964  1.000000 -0.346469
D -0.247604 -0.355542 -0.346469  1.000000

Figure 26: Example of a bivariate normal distribution¶
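
The covariance matrix handed to np.random.multivariate_normal below is assembled from the two standard deviations and the desired correlation: the diagonal holds the variances and the off-diagonal entry is r * sd1 * sd2. A small helper illustrating the construction (the function name is just illustrative):

In [ ]:
# Building a 2x2 covariance matrix from standard deviations and a correlation (illustrative helper)
import numpy as np

def make_cov(sd1, sd2, r):
    return np.array([[sd1 ** 2, r * sd1 * sd2],
                     [r * sd1 * sd2, sd2 ** 2]])

print(make_cov(3, 200, 0.5)) # the same matrix written out explicitly in the cell below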

In [ ]:
# Generate dataset
np.random.seed(0) # Set seed

# Parameters to set
mu_Temperature = 15
sd_Temperature = 3
variance_Temperature = sd_Temperature ** 2 # Variance is square of standard deviation

mu_Precipitation = 750
sd_Precipitation = 200
variance_Precipitation = sd_Precipitation ** 2 # Variance is square of standard deviation

multivar = np.random.multivariate_normal(
    [mu_Temperature, mu_Precipitation],
    [[variance_Temperature, sd_Temperature * sd_Precipitation * 0.5], [sd_Temperature * sd_Precipitation * 0.5, variance_Precipitation]], # Simulate correlation of 0.5
    1000
).T

# Plot histograms and scatterplot

# Create subplots
fig, axs = plt.subplots(1, 3, figsize = (15, 5))
axs = axs.ravel()

# Plot A: histogram of temperature
axs[0].hist(multivar[0], color = "red") # Plot histogram
axs[0].set_title('A. Histogram of temperature data')
axs[0].set_xlabel('Temperature (degrees C)')
axs[0].set_ylabel('Frequency')
axs[0].set_xlim(0, 30)

# Plot B
axs[1].hist(multivar[1], color = "blue") # Plot histogram
axs[1].set_title('B. Histogram of precipitation data')
axs[1].set_xlabel('Precipitation (mm/yr)')
axs[1].set_ylabel('Frequency')
axs[1].set_xlim(0, 1500)

# Plot C
axs[2].scatter(multivar[0], multivar[1], marker = '.', s = 100, color = "black") # Plot scatterplot
axs[2].set_title('C. Scatter plot of temperature and precipitation data')
axs[2].set_ylabel('Precipitation (mm/yr)')
axs[2].set_xlabel('Temperature (degrees C)')
axs[2].set_ylim(0, 1500)
axs[2].set_xlim(0, 30)
Out[ ]:
(0.0, 30.0)

Figure 27: 3D plot of a bivariate normal distribution with sampling¶

In [ ]:
# Parameters to set
mu_Temperature = 15
sd_Temperature = 3
variance_Temperature = sd_Temperature ** 2 # Variance is square of standard deviation

mu_Precipitation = 750
sd_Precipitation = 200
variance_Precipitation = sd_Precipitation ** 2 # Variance is square of standard deviation

# Generate 1000 samples from multivariate normal
samples = np.random.multivariate_normal([mu_Temperature, mu_Precipitation],
    [[variance_Temperature, sd_Temperature * sd_Precipitation * 0.5], [sd_Temperature * sd_Precipitation * 0.5, variance_Precipitation]], # Simulate correlation of 0.5
    1000
)

# Create grid of bin edges
x = np.linspace(0, 30, 30)
y = np.linspace(0, 1500, 30)

# Create a 2D histogram (binning the samples)
hist, x, y = np.histogram2d(samples[:, 0], samples[:, 1], bins = (x, y))

# Compute the bin centers
x_centers = (x[:-1] + x[1:]) / 2
y_centers = (y[:-1] + y[1:]) / 2
X, Y = np.meshgrid(x_centers, y_centers)

# Create multivariate normal surface
pos = np.empty(X.shape + (2, ))
pos[:, :, 0] = X; pos[:, :, 1] = Y
multivar = multivariate_normal(
    [mu_Temperature, mu_Precipitation],
    [[variance_Temperature, sd_Temperature * sd_Precipitation * 0.5], [sd_Temperature * sd_Precipitation * 0.5, variance_Precipitation]] # Simulate correlation of 0.5
)

# Create 3D figure with subplots
fig = plt.figure(figsize = plt.figaspect(0.5))

# A. 3D plot of multivariate histogram of samples
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_surface(X, Y, hist.T, cmap='viridis', edgecolor='none')
ax.set_title('A. 3D histogram of\ntemperature and precipitation data')
ax.set_xlabel('Temperature (degrees C)')
ax.set_ylabel('Precipitation (mm)')
ax.set_zlabel('Frequency')

# B. 3D plot of smooth multivariate normal
ax = fig.add_subplot(1, 2, 2, projection='3d')
ax.plot_surface(X, Y, multivar.pdf(pos), cmap = 'viridis', linewidth = 0)
ax.set_title('B. 3D surface of the multivariate normal distribution of\ntemperature and precipitation data')
ax.set_xlabel('Temperature (degrees C)')
ax.set_ylabel('Precipitation (mm)')
ax.set_zlabel('Probability density')
plt.show()

Figure 28: 3D plot of a bivariate normal distribution with contour lines¶

In [ ]:
# Parameters to set
mu_Temperature = 15
sd_Temperature = 3
variance_Temperature = sd_Temperature ** 2 # Variance is square of standard deviation

mu_Precipitation = 750
sd_Precipitation = 200
variance_Precipitation = sd_Precipitation ** 2 # Variance is square of standard deviation

# Create grid
x = np.linspace(0, 30, 30)
y = np.linspace(0, 1500, 30)
X, Y = np.meshgrid(x,y)

# Create multivariate normal surface
pos = np.empty(X.shape + (2, ))
pos[:, :, 0] = X; pos[:, :, 1] = Y
multivar = multivariate_normal(
    [mu_Temperature, mu_Precipitation],
    [[variance_Temperature, sd_Temperature * sd_Precipitation * 0.5], [sd_Temperature * sd_Precipitation * 0.5, variance_Precipitation]] # Simulate correlation of 0.5
)

# Create 3D figure with subplots
fig = plt.figure(figsize = plt.figaspect(0.4))

# A. 3D plot of smooth multivariate normal
ax = fig.add_subplot(1, 2, 1, projection = '3d')
ax.plot_surface(X, Y, multivar.pdf(pos), cmap = 'viridis', linewidth = 0)
ax.set_title('A. 3D surface of the multivariate normal distribution of\ntemperature and precipitation data')
ax.set_xlabel('Temperature (degrees C)')
ax.set_ylabel('Precipitation (mm)')
ax.set_zlabel('Probability density')

# B. Contour plot of smooth multivariate normal
contour_levels = [0.00001, 0.000025, 0.00005, 0.000075, 0.0001, 0.000125, 0.00015, 0.000175, 0.0002, 0.000225, 0.00025, 0.000275]  # Set levels for ellipses
ax = fig.add_subplot(1, 2, 2)
ax.contour(X, Y, multivar.pdf(pos), levels = contour_levels, cmap = 'viridis', linewidths = 1.5)
ax.set_title('B. Contour ellipses of the multivariate normal distribution of\ntemperature and precipitation data')
ax.set_xlabel('Temperature (degrees C)')
ax.set_ylabel('Precipitation (mm)')
plt.show()

Figure 29: Plot of a dataset consisting of multiple groups of datapoints¶

In [ ]:
# Create dataset by sampling from three multivariate normal distributions
np.random.seed(0) # Set seed

samples1 = np.random.multivariate_normal([3, 3],
    [[0.25, 0.5 * 0.6 * 0.3], [0.5 * 0.6 * 0.3, 0.36]], # Covariance matrix for a correlation of 0.3
    200
)
samples2 = np.random.multivariate_normal([5, 2],
    [[0.25, 0.5 * 0.3 * 0.7], [0.5 * 0.3 * 0.7, 0.09]], # Covariance matrix for a correlation of 0.7
    200
)
samples3 = np.random.multivariate_normal([6, 7],
    [[0.25, 0.5 * 0.6 * -0.3], [0.5 * 0.6 * -0.3, 0.36]], # Covariance matrix for a correlation of -0.3
    200
)

# Plot scatterplot panels

# Create subplots
fig, axs = plt.subplots(1, 2, figsize = (12, 5))

# Plot A scatterplot
axs[0].scatter(samples1[:, 0], samples1[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster A points
axs[0].scatter(samples2[:, 0], samples2[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster B points
axs[0].scatter(samples3[:, 0], samples3[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster C points
axs[0].scatter(5.2, 3.8, marker = '+', s = 100, color = "red") # Plot the unclassified sample in red
axs[0].set_title('A. Scatter plot of data with three clusters')
axs[0].set_ylabel('X2')
axs[0].set_xlabel('X1')
axs[0].set_ylim(0, 9)
axs[0].set_xlim(1, 8)
axs[0].text(2.7, 4.5, 'A', size = 18)
axs[0].text(6.5, 2.2, 'B', size = 18)
axs[0].text(4.5, 7.5, 'C', size = 18)

# Add annotation with arrows
axs[0].annotate('', xy = (5.2, 2.5), xytext = (5.2, 3.6), arrowprops = dict(arrowstyle = '->', color = 'red'))
axs[0].annotate('', xy = (4.2, 3.8), xytext = (5.0, 3.8), arrowprops = dict(arrowstyle = '->', color = 'red'))
axs[0].annotate('', xy = (6, 5.5), xytext = (5.3, 4), arrowprops = dict(arrowstyle = '->', color = 'red'))

# Plot B scatterplot with rectangles and arrows
axs[1].scatter(samples1[:, 0], samples1[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster A points
axs[1].scatter(samples2[:, 0], samples2[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster B points
axs[1].scatter(samples3[:, 0], samples3[:, 1], marker = '+', s = 10, color = "blue") # Plot cluster C points
axs[1].scatter(5.2, 3.8, marker = '+', s = 100, color = "red") # Plot the unclassified sample in red
axs[1].set_title('B. Scatter plot of data with three clusters with rectangles')
axs[1].set_ylabel('X2')
axs[1].set_xlabel('X1')
axs[1].set_ylim(0, 9)
axs[1].set_xlim(1, 8)
axs[1].text(2.0, 4.0, 'A', size = 18)
axs[1].text(6.1, 1.0, 'B', size = 18)
axs[1].text(4.8, 5.5, 'C', size = 18)

# Add annotation with arrows
axs[1].annotate('', xy = (5.2, 2.5), xytext = (5.2, 3.6), arrowprops = dict(arrowstyle = '->', color = 'red'))
axs[1].annotate('', xy = (4.2, 3.8), xytext = (5.0, 3.8), arrowprops = dict(arrowstyle = '->', color = 'red'))
axs[1].annotate('', xy = (6, 5.5), xytext = (5.3, 4), arrowprops = dict(arrowstyle = '->', color = 'red'))

# Add rectangles
rectA = patches.Rectangle(xy = (1.8, 0.8), width = 2.5, height = 3.9, edgecolor = 'black', facecolor = 'none')
rectB = patches.Rectangle(xy = (3.5, 0.7), width = 3.0, height = 2.3, edgecolor = 'black', facecolor = 'none')
rectC = patches.Rectangle(xy = (4.7, 5.2), width = 2.7, height = 3.7, edgecolor = 'black', facecolor = 'none')
axs[1].add_patch(rectA)
axs[1].add_patch(rectB)
axs[1].add_patch(rectC)
Out[ ]:
<matplotlib.patches.Rectangle at 0x190870e4910>

Figure 30: 3D plot of a dataset consisting of three groups of datapoints plotted as multivariate normal distributions¶

In [ ]:
# Create dataset by sampling from three multivariate normal distributions
np.random.seed(0) # Set seed

samples1 = np.random.multivariate_normal([3, 3],
    [[0.25, 0.5 * 0.6 * 0.3], [0.5 * 0.6 * 0.3, 0.36]], # Covariance matrix for a correlation of 0.3
    200
)
samples2 = np.random.multivariate_normal([5, 2],
    [[0.25, 0.5 * 0.3 * 0.7], [0.5 * 0.3 * 0.7, 0.09]], # Covariance matrix for a correlation of 0.7
    200
)
samples3 = np.random.multivariate_normal([6, 7],
    [[0.25, 0.5 * 0.6 * -0.3], [0.5 * 0.6 * -0.3, 0.36]], # Covariance matrix for a correlation of -0.3
    200
)

# Create grid
x = np.linspace(1, 8, 70)
y = np.linspace(0, 9, 90)
X, Y = np.meshgrid(x, y)

# Create multivariate normal surfaces
pos = np.empty(X.shape + (2, ))
pos[:, :, 0] = X; pos[:, :, 1] = Y
multivar1 = multivariate_normal(
    [3, 3],
    [[0.25, 0.5 * 0.6 * 0.3], [0.5 * 0.6 * 0.3, 0.36]]
)
multivar2 = multivariate_normal(
    [5, 2],
    [[0.25, 0.5 * 0.3 * 0.7], [0.5 * 0.3 * 0.7, 0.09]]
)
multivar3 = multivariate_normal(
    [6, 7],
    [[0.25, 0.5 * 0.6 * -0.3], [0.5 * 0.6 * -0.3, 0.36]]
)

# Create 3D figure with subplots
fig = plt.figure(figsize = plt.figaspect(0.4))

# A. 3D plot of smooth multivariate normal
ax = fig.add_subplot(1, 2, 1, projection = '3d')
ax.plot_surface(X, Y, multivar1.pdf(pos) + multivar2.pdf(pos) + multivar3.pdf(pos), cmap = 'viridis', linewidth = 0)
ax.set_title('A. 3D surface of the\nmultivariate normal distributions')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.set_zlabel('Probability density')

# B. Contour plot of smooth multivariate normal
contour_levels = [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.15, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65]  # Set levels for ellipses
ax = fig.add_subplot(1, 2, 2)
ax.contour(X, Y, multivar1.pdf(pos) + multivar2.pdf(pos) + multivar3.pdf(pos), levels = contour_levels, cmap = 'viridis', linewidths = 1.5)
ax.set_title('B. Contour ellipses of the\nmultivariate normal distributions')
ax.set_xlabel('X1')
ax.set_ylabel('X2')

# Add group labels
ax.text(1.5, 5.5, 'A', size = 18)
ax.text(6.1, 1.0, 'B', size = 18)
ax.text(3.2, 7.0, 'C', size = 18)

# Add red sample and annotation with arrows
ax.scatter(5.2, 3.8, marker = '+', s = 100, color = "red") # Plot the unclassified sample in red
ax.annotate('', xy = (5.2, 2.5), xytext = (5.2, 3.6), arrowprops = dict(arrowstyle = '->', color = 'red'))
ax.annotate('', xy = (4.2, 3.8), xytext = (5.0, 3.8), arrowprops = dict(arrowstyle = '->', color = 'red'))
ax.annotate('', xy = (6, 5.5), xytext = (5.3, 4), arrowprops = dict(arrowstyle = '->', color = 'red'))
plt.show()

Figure 31: Example of hierarchical clustering with 12 datapoints and two variables¶

In [ ]:
# Generate a small dataset with 12 data points and 2 variables
np.random.seed(1)  # For reproducibility
data = np.random.rand(12, 2) * 10  # 12 points in 2D space

# Perform hierarchical/agglomerative clustering
linked = linkage(data, method = 'single')

# Plot the scatter plot of the data points
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.scatter(data[:, 0], data[:, 1], c='blue', s=100)
for i in range(data.shape[0]):
    plt.text(data[i, 0] + 0.3, data[i, 1], f'P{i+1}', fontsize=12) # Add point labels
plt.title('A. scatter plot of data points')
plt.xlabel('Variable 1')
plt.ylabel('Variable 2')
plt.annotate('', xy = (data[1, 0], data[1, 1]), xytext = (data[6, 0], data[6, 1]), arrowprops = dict(arrowstyle = '<->', color = 'red'))

# Plot the dendrogram
plt.subplot(1, 2, 2)
dendrogram(
    linked,
    labels=[f'P{i + 1}' for i in range(data.shape[0])],
    leaf_rotation = 90,
    color_threshold = 5
)
plt.title('B. Dendrogram of hierarchically clustered data points')
plt.xlabel('Data Points')
plt.ylabel('Distance')

plt.tight_layout()
plt.show()

Figure 32: Example of dendrogram with cutoff line¶
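
The cutoff line corresponds to "cutting" the tree at a fixed distance; scipy's fcluster returns the resulting flat cluster labels for the cutoff used in the figure below:

In [ ]:
# Flat cluster labels at the dendrogram cutoff used below
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

np.random.seed(1)
data = np.random.rand(12, 2) * 10 # same 12 points as in the figure

linked = linkage(data, method = 'single')
labels = fcluster(linked, t = 2, criterion = 'distance') # cut the tree at distance 2
print(labels) # cluster membership of points P1..P12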

In [ ]:
# Generate a small dataset with 12 data points and 2 variables
np.random.seed(1)  # For reproducibility
data = np.random.rand(12, 2) * 10  # 12 points in 2D space

# Perform hierarchical/agglomerative clustering
linked = linkage(data, method = 'single')

# Plot the scatter plot of the data points
plt.figure(figsize=(10, 5))

# Plot the dendrogram
plt.subplot(1, 2, 2)
dendrogram(
    linked,
    labels=[f'P{i + 1}' for i in range(data.shape[0])],
    leaf_rotation = 90,
    color_threshold = 2
)
plt.axhline(2, linestyle='--', color = 'red')
plt.title('Dendrogram of hierarchically clustered data points')
plt.xlabel('Data Points')
plt.ylabel('Distance')

plt.tight_layout()
plt.show()

Figure 33: Hierarchical clustering with different similarity metrics¶

In [ ]:
# Generate a small dataset with 12 data points and 2 variables
np.random.seed(2)  # For reproducibility
data = np.random.rand(12, 2) * 10 - 5  # 12 points in 2D space

# Define different distance metrics
distance_metrics = {
    'Euclidean': 'euclidean',
    'Manhattan': 'cityblock',
    'Pearson': 'correlation'
}

# Create a 2 x 2 grid of subplots for the scatter plot + 3 dendrograms
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot of the data points (Top-left corner)
axes[0, 0].scatter(data[:, 0], data[:, 1], c='blue', s=100)
for i in range(data.shape[0]):
    axes[0, 0].text(data[i, 0] + 0.2, data[i, 1], f'P{i+1}', fontsize=12)
axes[0, 0].set_title('Scatter Plot of Data Points')
axes[0, 0].set_xlabel('Variable 1')
axes[0, 0].set_ylabel('Variable 2')

# Perform hierarchical clustering and plot dendrograms for each distance metric
for i, (metric_name, metric) in enumerate(distance_metrics.items()):
    
    # Perform hierarchical clustering
    linked = linkage(data, method = 'single', metric = metric)
    
    # Determine the subplot location
    ax_row = (i + 1) // 2
    ax_col = (i + 1) % 2
    
    # Plot the dendrogram
    dendrogram(linked, labels=[f'P{i+1}' for i in range(data.shape[0])], leaf_rotation=90, ax=axes[ax_row, ax_col])
    axes[ax_row, ax_col].set_title(f'Dendrogram ({metric_name} Distance)')
    axes[ax_row, ax_col].set_xlabel('Data Points')
    axes[ax_row, ax_col].set_ylabel('Distance')

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

Figure 44: Example of a time series with time at regular intervals on the horizontal axis¶

In [ ]:
# Load data
data = loadmat('TSA_example_Dinkel.mat')

# Extract time series
df = pd.DataFrame(np.concatenate((data['t'], data['data']), axis=1), columns=['t', 'data'])

# Plot time series
plt.figure(1, figsize = (12, 5)) # Create plot
plt.plot(df['t'], df['data'], linewidth = 0.7) # Plot the drainage data variable against variable t
plt.xlabel('time (days)') # Label x axis
plt.ylabel(r'$drainage~(m^3*s^{-1})$') # Label y axis
plt.title('Daily drainage of the Dinkel river')
Out[ ]:
Text(0.5, 1.0, 'Daily drainage of the Dinkel river')

Figure 47: Time series generated using different mathematical models¶

In [ ]:
np.random.seed(1) # Set seed for reproducibility

# Number of time points
n = 200

# Generate time series with different characteristics
# 1. Random noise time series
ts1 = np.random.normal(0, 1, n)

# 2. Red noise (autocorrelated) time series
ts2 = np.zeros(n)
alpha = 0.8 # Set factor with which to multiply
ts2[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts2[t] = alpha * ts2[t-1] + np.random.normal(0, 1) # Multiply previous value by the factor alpha and add random number

# 3. Combination of sinusoids time series
t = np.arange(n) # Create time vector
ts3 = 3 * np.sin(t / 10) + np.sin(t / 5)

# 4. Random walk time series
ts4 = np.zeros(n)
ts4[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts4[t] = ts4[t-1] + np.random.normal(0, 0.2) # Add random number to the previous value to obtain the next number

# Plot all time series
plt.figure(figsize=(12, 10))

plt.subplot(4, 1, 1)
plt.plot(ts1, label='Random Noise')
plt.title('Time series 1: Random Noise Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.ylim(-6, 6)

plt.subplot(4, 1, 2)
plt.plot(ts2, label='Red Noise (Autocorrelated)', color='orange')
plt.title('Time series 2: Red Noise Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.ylim(-6, 6)

plt.subplot(4, 1, 3)
plt.plot(ts3, label='Combination of Sinusoids', color='green')
plt.title('Time series 3: Combination of Sinusoids Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.ylim(-6, 6)

plt.subplot(4, 1, 4)
plt.plot(ts4, label='Random Walk', color='red')
plt.title('Time series 4: Random Walk Time Series')
plt.xlabel('Time')
plt.ylabel('Value')
plt.ylim(-6, 6)

plt.tight_layout()
plt.show()
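
For reference, the red-noise and random-walk series are both autoregressive recursions: the red noise follows $x_t = \alpha x_{t-1} + \varepsilon_t$ with $\alpha = 0.8$, and the random walk is the $\alpha = 1$ limit, $x_t = x_{t-1} + \varepsilon_t$, where $\varepsilon_t$ is Gaussian noise. This is why the red noise keeps reverting towards zero while the random walk wanders without a fixed mean.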

Figure 48: Autocorrelation functions of time series generated using different mathematical models¶

In [ ]:
np.random.seed(1) # Set seed for reproducibility

# Number of time points
n = 200

# Generate time series with different characteristics
# 1. Random noise time series
ts1 = np.random.normal(0, 1, n)

# 2. Red noise (autocorrelated) time series
ts2 = np.zeros(n)
alpha = 0.8 # Set factor with which to multiply
ts2[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts2[t] = alpha * ts2[t-1] + np.random.normal(0, 1) # Multiply previous value by the factor alpha and add random number

# 3. Combination of sinusoids time series
t = np.arange(n) # Create time vector
ts3 = 3 * np.sin(t / 10) + np.sin(t / 5)

# 4. Random walk time series
ts4 = np.zeros(n)
ts4[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts4[t] = ts4[t-1] + np.random.normal(0, 0.2) # Add random number to the previous value to obtain the next number

# Plot the autocorrelation functions of the four time series
plt.figure(figsize=(12, 12))
plt.subplots_adjust(hspace = 0.5) # Add vertical space between the subplots

plt.subplot(4, 1, 1)
plt.acorr(ts1, maxlags = 100)
plt.title('Time series 1: Random Noise Time Series')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.ylim(-1, 1)

plt.subplot(4, 1, 2)
plt.acorr(ts2, maxlags = 100)
plt.title('Time series 2: Red Noise Time Series')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.ylim(-1, 1)

plt.subplot(4, 1, 3)
plt.acorr(ts3, maxlags = 100)
plt.title('Time series 3: Combination of Sinusoids')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.ylim(-1, 1)

plt.subplot(4, 1, 4)
plt.acorr(ts4, maxlags = 100)
plt.title('Time series 4: Random Walk Time Series')
plt.xlabel('Lag')
plt.ylabel('Autocorrelation')
plt.ylim(-1, 1)

plt.show()
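
The panels above use Matplotlib's acorr, which draws lags symmetrically around zero. A sketch of the same figure using plot_acf from statsmodels (imported at the top of the notebook), which draws positive lags only and normalizes as a statistical autocorrelation function:

In [ ]:
# Alternative: autocorrelation functions via statsmodels (positive lags only)
fig, ax = plt.subplots(4, 1, figsize = (12, 12))
plot_acf(ts1, lags = 100, alpha = None, ax = ax[0], title = 'ACF: Random Noise')
plot_acf(ts2, lags = 100, alpha = None, ax = ax[1], title = 'ACF: Red Noise')
plot_acf(ts3, lags = 100, alpha = None, ax = ax[2], title = 'ACF: Combination of Sinusoids')
plot_acf(ts4, lags = 100, alpha = None, ax = ax[3], title = 'ACF: Random Walk')
plt.tight_layout()
plt.show()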

Figure 49: Plot of a periodic time series with noise¶

In [ ]:
# Set the random seed for reproducibility
np.random.seed(1)

# Number of time points
n = 200

# Generate the time points
t = np.arange(n)

# Generate a periodic time series with decreasing amplitude
decreasing_amplitude_series = np.exp(-t/50) * np.sin(t / 5)

# Generate a noisy version of the same time series
noise = np.random.normal(0, 0.2, n)
noisy_series = decreasing_amplitude_series + noise

# Plot the time series
plt.figure(figsize = (10, 6))

# Plot the original periodic time series with decreasing amplitude in red
plt.plot(t, decreasing_amplitude_series, label = 'Signal', color='red')

# Plot the noisy version in blue
plt.plot(t, noisy_series, label = 'Signal + noise', color = 'blue', alpha = 0.7)

# Add title and labels
plt.title('signal and noise')
plt.xlabel('Time')
plt.ylabel('Value')

# Show legend
plt.legend()

# Display the plot
plt.show()

Figure 50: Plot of a periodic time series with noise, smoothed using different window sizes¶

In [ ]:
# Set the random seed for reproducibility
np.random.seed(1)

# Number of time points
n = 200

# Generate the time points
t = np.arange(n)

# Generate a periodic time series with decreasing amplitude
decreasing_amplitude_series = np.exp(-t/50) * np.sin(t / 5)

# Generate a noisy version of the same time series
noise = np.random.normal(0, 0.2, n)
noisy_series = decreasing_amplitude_series + noise

def centered_moving_average(series, window_size):
    # 'valid' mode returns len(series) - window_size + 1 points; the NaN padding below re-centers them
    return np.convolve(series, np.ones(window_size)/window_size, mode='valid')

# Smoothing window sizes (a window of 2n + 1 points around each value)
window_size_3 = 3  # n = 1
window_size_11 = 11  # n = 5

# Compute the smoothed series and center them
smoothed_series_3 = np.concatenate((
    [np.nan] * (window_size_3 // 2),
    centered_moving_average(noisy_series, window_size_3),
    [np.nan] * (window_size_3 // 2)
))

smoothed_series_11 = np.concatenate((
    [np.nan] * (window_size_11 // 2),
    centered_moving_average(noisy_series, window_size_11),
    [np.nan] * (window_size_11 // 2)
))

# Plot the time series
plt.figure(figsize=(12, 10))

# Top plot with window size 3 (n=1)
plt.subplot(2, 1, 1)
plt.plot(t, decreasing_amplitude_series, label='Signal', color='red')
plt.plot(t, noisy_series, label='Signal + noise', color='lightgrey')
plt.plot(t, smoothed_series_3, label='Smoothed, n = 1', color='blue')
plt.title('Noisy Series with Moving Average (n = 1; Window Size = 3)')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

# Bottom plot with window size 11 (n=5)
plt.subplot(2, 1, 2)
plt.plot(t, decreasing_amplitude_series, label='Signal', color='red')
plt.plot(t, noisy_series, label='Signal + noise', color='lightgrey')
plt.plot(t, smoothed_series_11, label='Smoothed, n = 5', color='blue')
plt.title('Noisy Series with Moving Average (n = 5; Window Size = 11)')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()


# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
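
The NaN padding above serves only to re-center the 'valid' convolution output on the original time axis. A pandas rolling window (pandas is imported at the top of the notebook) yields the same centered moving average with NaN edges directly; a minimal cross-check:

In [ ]:
# Equivalent centered moving average via pandas
smoothed_series_11_pd = pd.Series(noisy_series).rolling(window = 11, center = True).mean().to_numpy()
print(np.allclose(smoothed_series_11, smoothed_series_11_pd, equal_nan = True)) # Should print True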

Figure 51: Example of a first- and second-order stationary time series¶

In [ ]:
# Generate data
np.random.seed(1) # Set seed for reproducibility

# Number of time points
n = 200

# Generate the time points
t = np.arange(n)

# Generate random noise time series
ts = np.random.normal(0, 1, n)

# Calculate means and standard deviations for each group of 10 points
def calc_means_stds(series, group_size = 10):
    means = []
    stds = []
    for i in range(0, len(series), group_size):
        group = series[i:i+group_size]
        means.append(np.mean(group))
        stds.append(np.std(group))
    return means, stds

# Calculate time series consisting of means and standard deviations
ts_means, ts_sds = calc_means_stds(ts, 10)


# Plot all time series
plt.figure(figsize = (12, 10))

plt.subplot(3, 1, 1)
plt.plot(t, ts)
plt.title('First and second order stationary time series')
plt.xlabel('t')
plt.ylabel('Value')
plt.ylim(-3.5, 3.5)

plt.subplot(3, 1, 2)
plt.plot(np.arange(5, 205, 10), ts_means)
plt.title('Mean of windows of 10 time steps')
plt.xlabel('t')
plt.ylabel('mean')
plt.ylim(-3.5, 3.5)

plt.subplot(3, 1, 3)
plt.plot(np.arange(5, 205, 10), ts_sds)
plt.title('Standard deviations of windows of 10 time steps')
plt.xlabel('t')
plt.ylabel('standard deviation')
plt.ylim(0, 2)
plt.show()
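
The windowed means and standard deviations give a visual check of first- and second-order stationarity. A common formal complement is the augmented Dickey-Fuller unit-root test; a minimal sketch using adfuller, which is not imported at the top of this notebook:

In [ ]:
from statsmodels.tsa.stattools import adfuller # Not imported at the top of the notebook

adf_stat, p_value, *_ = adfuller(ts)
print('ADF statistic =', adf_stat)
print('p-value =', p_value) # A small p-value rejects the unit-root null, consistent with stationarity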

Figure 52: Removing a linear trend from a time series¶

In [ ]:
# Generate data
np.random.seed(1) # Set seed for reproducibility

# Number of time points
n = 200

# Generate the time points
t = np.arange(n)

# Generate random noise time series with linear trend
ts_trend = 0.01 * t + np.random.normal(0, 1, n)

# Perform linear regression
slope, intercept = np.polyfit(t, ts_trend, 1)

print(slope)
print(intercept)

# Subtract trend from dataset
ts_detrended = ts_trend - (slope * t + intercept)

# Plot the time series
plt.figure(figsize=(12, 10))

# Top plot
plt.subplot(2, 1, 1)
plt.plot(t, ts_trend, label = 'Time series', color = 'blue')
plt.plot(t, slope * t + intercept, label = 'simple linear regression through time series', color = 'red')
plt.title('Time series with trend')
plt.xlabel('t')
plt.ylabel('Value')
plt.ylim(-2.5, 4.5)
plt.legend()

# Bottom plot
plt.subplot(2, 1, 2)
plt.plot(t, ts_detrended, label = 'Detrended time series', color = 'blue')
plt.title('Time series after detrending')
plt.xlabel('t')
plt.ylabel('Value')
plt.ylim(-2.5, 4.5)
plt.legend()
0.011713154113288394
-0.06377001942424641
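
The regress-and-subtract steps above amount to linear detrending, which scipy also provides as a single function; a minimal cross-check using scipy.signal.detrend, which is not imported at the top of this notebook:

In [ ]:
from scipy.signal import detrend # Not imported at the top of the notebook

ts_detrended_scipy = detrend(ts_trend, type = 'linear')
print(np.allclose(ts_detrended, ts_detrended_scipy)) # Should print True: both remove the least-squares linear fit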

Figure 57: Power spectra of the four types of time series¶

In [ ]:
np.random.seed(1) # Set seed for reproducibility

# Number of time points
n = 200

# Generate time series with different characteristics
# 1. Random noise time series
ts1 = np.random.normal(0, 1, n)

# 2. Red noise (autocorrelated) time series
ts2 = np.zeros(n)
alpha = 0.8 # Set factor with which to multiply
ts2[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts2[t] = alpha * ts2[t-1] + np.random.normal(0, 1) # Multiply previous value by the factor alpha and add random number

# 3. Combination of sinusoids time series
t = np.arange(n) # Create time vector
ts3 = 3 * np.sin(t / 10) + np.sin(t / 5)

# 4. Random walk time series
ts4 = np.zeros(n)
ts4[0] = np.random.normal(0, 1) # Set first value
for t in range(1, n):
    ts4[t] = ts4[t-1] + np.random.normal(0, 0.2) # Add random number to the previous value to obtain the next number


# Function to compute the power spectrum
def compute_power_spectrum(series):
    fft_result = np.fft.fft(series)
    power_spectrum = np.abs(fft_result) ** 2
    frequencies = np.fft.fftfreq(len(series))
    return frequencies[:len(series)//2], power_spectrum[:len(series)//2]  # Return only the positive frequencies

# Compute power spectra
frequencies1, power1 = compute_power_spectrum(ts1)
frequencies2, power2 = compute_power_spectrum(ts2)
frequencies3, power3 = compute_power_spectrum(ts3)
frequencies4, power4 = compute_power_spectrum(ts4)

# Plot the power spectra

plt.figure(figsize=(12, 10))

# Random Noise Power Spectrum
plt.subplot(2, 2, 1)
plt.plot(frequencies1, power1, color='blue')
plt.title('Power Spectrum: Random Noise')
plt.xlabel('Frequency')
plt.ylabel('Power')
plt.ylim(0, 80000)
plt.xlim(0, 0.25)

# Red Noise Power Spectrum
plt.subplot(2, 2, 2)
plt.plot(frequencies2, power2, color='red')
plt.title('Power Spectrum: Red Noise')
plt.xlabel('Frequency')
plt.ylabel('Power')
plt.ylim(0, 80000)
plt.xlim(0, 0.25)

# Sinusoidal Combination Power Spectrum
plt.subplot(2, 2, 3)
plt.plot(frequencies3, power3, color='green')
plt.title('Power Spectrum: Sinusoidal Combination')
plt.xlabel('Frequency')
plt.ylabel('Power')
plt.ylim(0, 80000)
plt.xlim(0, 0.25)

# Random Walk Power Spectrum
plt.subplot(2, 2, 4)
plt.plot(frequencies4, power4, color='orange')
plt.title('Power Spectrum: Random Walk')
plt.xlabel('Frequency')
plt.ylabel('Power')
plt.ylim(0, 80000)
plt.xlim(0, 0.25)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
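
The spectra above are raw squared FFT magnitudes at the positive frequencies. scipy offers the same computation with a density normalization built in; a minimal sketch for the sinusoid series using scipy.signal.periodogram, which is not imported at the top of this notebook:

In [ ]:
from scipy.signal import periodogram # Not imported at the top of the notebook

freqs, psd = periodogram(ts3, fs = 1.0) # fs = 1 sample per time step
plt.figure(figsize = (6, 4))
plt.plot(freqs, psd, color = 'green')
plt.title('Periodogram: Sinusoidal Combination')
plt.xlabel('Frequency')
plt.ylabel('Power spectral density')
plt.show()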

Figure 58: Plots showing Gumbel distributions with different parameters¶

In [ ]:
# Parameters for four different Gumbel distributions
params = [
    {'mu': 0, 'alpha': 1},
    {'mu': 1, 'alpha': 1},
    {'mu': 2, 'alpha': 1},
    {'mu': 2, 'alpha': 2}
]

# Generate x values
x = np.linspace(0, 15, 1500)

# Plot the distributions
plt.figure(figsize = (8, 6))
for param in params:
    mu = param['mu']
    alpha = param['alpha']
    rv = gumbel_r(loc = mu, scale = alpha)
    plt.plot(x, rv.pdf(x), label = rf'$\mu$={mu}, $\alpha$={alpha}') # Raw f-string avoids invalid escape warnings

# Add labels and legend
plt.title(r'Gumbel Distributions with Different $\mu$ and $\alpha$') # Raw string avoids invalid escape warnings
plt.xlabel('x')
plt.ylabel('p')
plt.legend()
plt.grid(True)
plt.show()
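
Besides the density used for plotting, the frozen gumbel_r objects expose the CDF (cdf) and its inverse, the quantile function (ppf), which underpin the exceedance-probability calculations later in this notebook; a minimal sketch:

In [ ]:
rv = gumbel_r(loc = 0, scale = 1) # Standard Gumbel distribution
print('P(X <= 2) =', rv.cdf(2)) # Probability of not exceeding 2
print('1-in-100 quantile =', rv.ppf(0.99)) # The level exceeded with probability 0.01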

Figure 59: Repeat plot of Dinkel river discharge¶

In [ ]:
# Load data
data = loadmat('TSA_example_Dinkel.mat')

# Extract time series
df = pd.DataFrame(np.concatenate((data['t'], data['data']), axis=1), columns=['t', 'data'])

# Plot time series
plt.figure(1, figsize = (12, 5)) # Create plot
plt.plot(df['t'] / 365, df['data'], linewidth = 0.7) # Plot the drainage data variable against variable t
plt.axhline(y = 31, color = 'red', linestyle = '--', label = 'Discharge of 31 m3/s')
plt.xlabel('time (years)') # Label x axis
plt.ylabel(r'$drainage~(m^3*s^{-1})$') # Label y axis
plt.title('Daily drainage of the Dinkel river')

# Calculate time discharge has exceeded 31 m3/s in dataset
print('Number of times discharge exceeded 31 m3/s = ', sum(i > 31 for i in df['data']))

# Calculate statistics of the dataset
print('mean =', df['data'].mean())
print('standard deviation =', df['data'].std())

# Calculate Gumbel parameters by the method of moments
# (alpha = std * sqrt(6)/pi, approximated here by 0.77987 * std; 0.5772 approximates the Euler-Mascheroni constant)
print('alpha =', df['data'].std() * 0.77987)
print('mu =', df['data'].mean() - 0.5772 * df['data'].std() * 0.77987)
Number of times discharge exceeded 31 m3/s =  6
mean = 3.5609816833941608
standard deviation = 4.902482440750759
alpha = 3.823298981068294
mu = 1.3541735115215414

Figure 60: Histogram of discharge data with Gumbel distribution plotted on top¶

In [ ]:
# Load data
data = loadmat('TSA_example_Dinkel.mat')

# Extract time series
df = pd.DataFrame(np.concatenate((data['t'], data['data']), axis=1), columns=['t', 'data'])

# Calculate time discharge has exceeded 31 m3/s in dataset
print('Number of times discharge exceeded 31 m3/s = ', sum(i > 31 for i in df['data']))

# Calculate statistics of the dataset
mean_discharge = df['data'].mean()
sd_discharge =  df['data'].std()
print('mean =', mean_discharge)
print('standard deviation =', sd_discharge)

# Calculate Gumbel parameters (method of moments, as above)
alpha = sd_discharge * 0.77987
mu = mean_discharge - 0.5772 * alpha
print('alpha =', alpha)
print('mu =', mu)


# Generate x values
x = np.linspace(0, 30, 3000)
Gumbel_y = gumbel_r(loc = mu, scale = alpha)

# Plot histogram
plt.hist(df['data'], density = True, color = "red", bins = 30, align = 'mid', rwidth = 0.9) # Plot density-normalized histogram
plt.plot(x, Gumbel_y.pdf(x), label = rf'$\mu$={mu:.3f}, $\alpha$={alpha:.3f}', color = 'blue') # Raw f-string with the fitted parameters
plt.title('Histogram of daily Dinkel discharge data + Gumbel distribution')
plt.xlabel(r'$discharge~(m^3*s^{-1})$')
plt.ylabel('Probability density')
plt.xlim(0, 30)
plt.legend()

# Calculate probability of a discharge of >=31 m3/s
print((31 - mu) / alpha)
print('P(discharge <= 31 m3/s) = ', np.exp(-1 * np.exp(-1 * (31 - mu) / alpha)))
print('P(discharge > 31 m3/s) = ', 1 - np.exp(-1 * np.exp(-1 * (31 - mu) / alpha)))
print('Recurrence interval = ', 1 / (1 - np.exp(-1 * np.exp(-1 * (31 - mu) / alpha))), 'days')
print('Recurrence interval = ', 1 / (1 - np.exp(-1 * np.exp(-1 * (31 - mu) / alpha))) / 365.25, 'years')
Number of times discharge exceeded 31 m3/s =  6
mean = 3.5609816833941608
standard deviation = 4.902482440750759
alpha = 3.823298981068294
mu = 1.3541735115215414
7.7539911566620185
P(discharge <= 31 m3/s) =  0.999571065213064
P(discharge > 31 m3/s) =  0.00042893478693595277
Recurrence interval =  2331.3567247445403 days
Recurrence interval =  6.382906843927557 years
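
For reference, the exceedance probability and recurrence interval printed above follow directly from the Gumbel CDF:

$$P(X \le x) = \exp\left(-\exp\left(-\frac{x - \mu}{\alpha}\right)\right), \qquad T(x) = \frac{1}{1 - P(X \le x)}$$

With $x = 31$, $\mu \approx 1.354$ and $\alpha \approx 3.823$, this gives $P(X > 31) \approx 4.3 \cdot 10^{-4}$ per day, i.e. a recurrence interval of roughly 2331 days, or about 6.4 years.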

Figure 62: Example of three different spatial interpolation methods for one point¶

In [ ]:
# Step 1: Generate a Fake Spatial Dataset
np.random.seed(1)  # For reproducibility
grid_size = 10
x = np.random.uniform(0, 100, grid_size)
y = np.random.uniform(0, 100, grid_size)
values = np.random.uniform(100, 50, grid_size) # Note: low and high appear swapped; NumPy still draws values between 50 and 100

# Step 2: Choose a Point Not Represented in the Dataset
unknown_point = np.array([50, 50])  # A point within the grid

# Step 3: Define Interpolation Methods

def search_radius_interpolation(unknown_point, x, y, values, radius=20):
    distances = np.sqrt((x - unknown_point[0])**2 + (y - unknown_point[1])**2)
    within_radius = distances <= radius
    if np.any(within_radius):
        interpolated_value = np.mean(values[within_radius])
    else:
        interpolated_value = np.nan  # If no points are within the radius
    return interpolated_value, within_radius

def nearest_neighbor_interpolation(unknown_point, x, y, values):
    tree = cKDTree(list(zip(x, y)))
    dist, idx = tree.query(unknown_point)
    interpolated_value = values[idx]
    return interpolated_value, idx

def inverse_distance_weighting_interpolation(unknown_point, x, y, values, power=2):
    distances = np.sqrt((x - unknown_point[0])**2 + (y - unknown_point[1])**2)
    if np.any(distances == 0):
        return values[np.argmin(distances)], np.zeros_like(distances)  # Return the value if the point is exactly at a known point
    weights = 1 / distances**power
    interpolated_value = np.sum(weights * values) / np.sum(weights)
    return interpolated_value, weights

# Perform Interpolations
radius = 20
search_radius_value, within_radius = search_radius_interpolation(unknown_point, x, y, values, radius)
nearest_neighbor_value, nearest_idx = nearest_neighbor_interpolation(unknown_point, x, y, values)
idw_value, weights = inverse_distance_weighting_interpolation(unknown_point, x, y, values)

# Plotting

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Plot for Search Radius Interpolation
axs[0].scatter(x, y, c=values, cmap='coolwarm', label='Known points')
axs[0].scatter(*unknown_point, color='green', label='Unknown point')
circle = plt.Circle(unknown_point, radius, color='blue', fill=False, linestyle='--', label='Search Radius')
axs[0].add_patch(circle)
axs[0].scatter(x[within_radius], y[within_radius], facecolors='none', edgecolors='blue', s=100, label='Points within Radius')
axs[0].set_title(f'A. Search Radius Interpolation\nInterpolated Value: {search_radius_value:.2f}')
axs[0].legend()
axs[0].set_xlabel('X')
axs[0].set_ylabel('Y')

# Plot for Nearest Neighbor Interpolation
axs[1].scatter(x, y, c=values, cmap='coolwarm', label='Known points')
axs[1].scatter(*unknown_point, color='green', label='Unknown point')
axs[1].plot([unknown_point[0], x[nearest_idx]], [unknown_point[1], y[nearest_idx]], 'k--', label='Nearest Neighbor Line')
axs[1].scatter(x[nearest_idx], y[nearest_idx], color='orange', s=100, label='Nearest Neighbor')
axs[1].set_title(f'B. Nearest Neighbor Interpolation\nInterpolated Value: {nearest_neighbor_value:.2f}')
axs[1].legend()
axs[1].set_xlabel('X')
axs[1].set_ylabel('Y')

# Plot for IDW Interpolation
sc = axs[2].scatter(x, y, c=values, cmap='coolwarm', label='Known points')
axs[2].scatter(*unknown_point, color='green', label='Unknown point')
for i in range(len(x)):
    axs[2].plot([unknown_point[0], x[i]], [unknown_point[1], y[i]], 'k-', alpha=weights[i], lw=2 * weights[i], label='Weighted line' if i == 0 else "")
axs[2].set_title(f'C. Inverse Distance Weighting (IDW)\nInterpolated Value: {idw_value:.2f}')
axs[2].legend()
axs[2].set_xlabel('X')
axs[2].set_ylabel('Y')

plt.tight_layout()
plt.show()

# Output Results
print(f"Search Radius Interpolation Value: {search_radius_value}")
print(f"Nearest Neighbor Interpolation Value: {nearest_neighbor_value}")
print(f"Inverse Distance Weighting (IDW) Interpolation Value: {idw_value}")
Search Radius Interpolation Value: 79.00501620228953
Nearest Neighbor Interpolation Value: 59.96277156622317
Inverse Distance Weighting (IDW) Interpolation Value: 71.23961464234867
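
These hand-rolled methods make the mechanics explicit; for the nearest-neighbour case, the griddata function imported at the top of the notebook gives the same answer in a single call. A minimal cross-check:

In [ ]:
# Cross-check the nearest-neighbour result with scipy's griddata
gd_nearest = griddata(np.c_[x, y], values, unknown_point.reshape(1, -1), method = 'nearest')
print(gd_nearest[0]) # Should match the cKDTree-based value above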

Figure 63: Examples of spatial interpolation methods filling the entire area¶

In [ ]:
# Step 1: Generate a Fake Spatial Dataset
np.random.seed(1)  # For reproducibility
grid_size = 100
x = np.random.uniform(0, 100, grid_size)
y = np.random.uniform(0, 100, grid_size)
values = np.random.uniform(10, 50, grid_size)

# Step 2: Create a dense grid of points to fill the entire space
grid_x, grid_y = np.meshgrid(np.linspace(0, 100, 200), np.linspace(0, 100, 200))
grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]

# Step 3: Define Interpolation Methods for the Dense Grid

def search_radius_interpolation_grid(grid_points, x, y, values, radius=20):
    interpolated_values = np.full(grid_points.shape[0], np.nan)
    for i, point in enumerate(grid_points):
        distances = np.sqrt((x - point[0])**2 + (y - point[1])**2)
        within_radius = distances <= radius
        if np.any(within_radius):
            interpolated_values[i] = np.mean(values[within_radius])
    return interpolated_values

def nearest_neighbor_interpolation_grid(grid_points, x, y, values):
    tree = cKDTree(list(zip(x, y)))
    dist, idx = tree.query(grid_points)
    interpolated_values = values[idx]
    return interpolated_values

def inverse_distance_weighting_interpolation_grid(grid_points, x, y, values, power=2):
    interpolated_values = np.zeros(grid_points.shape[0])
    for i, point in enumerate(grid_points):
        distances = np.sqrt((x - point[0])**2 + (y - point[1])**2)
        if np.any(distances == 0):
            interpolated_values[i] = values[np.argmin(distances)]  # Exact point
        else:
            weights = 1 / distances**power
            interpolated_values[i] = np.sum(weights * values) / np.sum(weights)
    return interpolated_values

# Perform Interpolations on the Dense Grid
search_radius_values = search_radius_interpolation_grid(grid_points, x, y, values, radius=20)
nearest_neighbor_values = nearest_neighbor_interpolation_grid(grid_points, x, y, values)
idw_values = inverse_distance_weighting_interpolation_grid(grid_points, x, y, values)

# Reshape the results to match the grid
search_radius_values = search_radius_values.reshape(grid_x.shape)
nearest_neighbor_values = nearest_neighbor_values.reshape(grid_x.shape)
idw_values = idw_values.reshape(grid_x.shape)

# Plotting

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Plot for Search Radius Interpolation
im1 = axs[0].imshow(search_radius_values, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[0].scatter(x, y, c=values, cmap='coolwarm', edgecolor='black')
axs[0].set_title('Search Radius Interpolation')
plt.colorbar(im1, ax=axs[0])

# Plot for Nearest Neighbor Interpolation
im2 = axs[1].imshow(nearest_neighbor_values, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[1].scatter(x, y, c=values, cmap='coolwarm', edgecolor='black')
axs[1].set_title('Nearest Neighbor Interpolation')
plt.colorbar(im2, ax=axs[1])

# Plot for IDW Interpolation
im3 = axs[2].imshow(idw_values, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[2].scatter(x, y, c=values, cmap='coolwarm', edgecolor='black')
axs[2].set_title('Inverse Distance Weighting (IDW)')
plt.colorbar(im3, ax=axs[2])

plt.tight_layout()
plt.show()

Figure 65: Positive, negative and zero spatial autocorrelation¶

In [ ]:
# Set random seed for reproducibility
np.random.seed(42)

# Generate random spatial coordinates
n_points = 100
x = np.random.uniform(0, 100, n_points)
y = np.random.uniform(0, 100, n_points)

# Function to calculate inverse distance weighting
def inverse_distance_weighting(x, y, values, grid_x, grid_y, power=2):
    grid_values = np.zeros_like(grid_x)
    for i in range(grid_x.shape[0]):
        for j in range(grid_x.shape[1]):
            distances = np.sqrt((x - grid_x[i, j])**2 + (y - grid_y[i, j])**2)
            if np.any(distances == 0):
                grid_values[i, j] = values[np.argmin(distances)]
            else:
                weights = 1 / (distances**power)
                grid_values[i, j] = np.sum(weights * values) / np.sum(weights)
    return grid_values

# Generate datasets with different types of spatial autocorrelation

# 1. Positive Spatial Autocorrelation
def generate_positive_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] += values[j] * np.exp(-distance / spatial_scale)
    return values

# 2. Negative Spatial Autocorrelation
def generate_negative_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] -= values[j] * np.exp(-distance / spatial_scale)
    return values

# 3. Zero Spatial Autocorrelation
def generate_zero_autocorrelation(n_points):
    values = np.random.uniform(-1, 1, n_points)
    return values

# Generate the datasets
positive_autocorr_values = generate_positive_autocorrelation(n_points)
negative_autocorr_values = generate_negative_autocorrelation(n_points)
zero_autocorr_values = generate_zero_autocorrelation(n_points)

# Create a dense grid for interpolation
grid_x, grid_y = np.meshgrid(np.linspace(0, 100, 200), np.linspace(0, 100, 200))

# Perform IDW interpolation on each dataset
idw_positive = inverse_distance_weighting(x, y, positive_autocorr_values, grid_x, grid_y)
idw_negative = inverse_distance_weighting(x, y, negative_autocorr_values, grid_x, grid_y)
idw_zero = inverse_distance_weighting(x, y, zero_autocorr_values, grid_x, grid_y)

# Plotting the datasets with larger symbols and interpolated surfaces
fig, axs = plt.subplots(2, 3, figsize=(18, 12))

# Define symbol size
symbol_size = 100  # Increased size for symbols

# Positive Spatial Autocorrelation Plot
sc1 = axs[0, 0].scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[0, 0].set_title('Positive Spatial Autocorrelation')
plt.colorbar(sc1, ax=axs[0, 0])
axs[0, 0].set_xlabel('X')
axs[0, 0].set_ylabel('Y')

# Negative Spatial Autocorrelation Plot
sc2 = axs[0, 1].scatter(x, y, c=negative_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[0, 1].set_title('Negative Spatial Autocorrelation')
plt.colorbar(sc2, ax=axs[0, 1])
axs[0, 1].set_xlabel('X')
axs[0, 1].set_ylabel('Y')

# Zero Spatial Autocorrelation Plot
sc3 = axs[0, 2].scatter(x, y, c=zero_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[0, 2].set_title('Zero Spatial Autocorrelation')
plt.colorbar(sc3, ax=axs[0, 2])
axs[0, 2].set_xlabel('X')
axs[0, 2].set_ylabel('Y')

# Interpolated Positive Spatial Autocorrelation Plot
im1 = axs[1, 0].imshow(idw_positive, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[1, 0].scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[1, 0].set_title('IDW Interpolation (Positive Autocorrelation)')
plt.colorbar(im1, ax=axs[1, 0])
axs[1, 0].set_xlabel('X')
axs[1, 0].set_ylabel('Y')

# Interpolated Negative Spatial Autocorrelation Plot
im2 = axs[1, 1].imshow(idw_negative, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[1, 1].scatter(x, y, c=negative_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[1, 1].set_title('IDW Interpolation (Negative Autocorrelation)')
plt.colorbar(im2, ax=axs[1, 1])
axs[1, 1].set_xlabel('X')
axs[1, 1].set_ylabel('Y')

# Interpolated Zero Spatial Autocorrelation Plot
im3 = axs[1, 2].imshow(idw_zero, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
axs[1, 2].scatter(x, y, c=zero_autocorr_values, cmap='coolwarm', edgecolor='black', s=symbol_size)
axs[1, 2].set_title('IDW Interpolation (Zero Autocorrelation)')
plt.colorbar(im3, ax=axs[1, 2])
axs[1, 2].set_xlabel('X')
axs[1, 2].set_ylabel('Y')

plt.tight_layout()
plt.show()
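
A common single-number summary of such patterns is Moran's I, which is not computed in the syllabus; a minimal sketch (the helper below is our own, not part of the course material) with inverse-distance weights, reusing the coordinates and values generated in the cell above:

In [ ]:
# Moran's I: > 0 indicates clustering of similar values, < 0 indicates alternation, ~0 indicates no spatial structure
def morans_i(x, y, values):
    n = len(values)
    d = np.sqrt((x[:, None] - x[None, :])**2 + (y[:, None] - y[None, :])**2) # Pairwise distances
    w = np.where(d > 0, 1.0 / np.maximum(d, 1e-12), 0.0) # Inverse-distance weights, zero on the diagonal
    z = values - values.mean() # Deviations from the mean
    return (n / w.sum()) * np.sum(w * z[:, None] * z[None, :]) / np.sum(z**2)

print('Moran I (positive autocorrelation) =', morans_i(x, y, positive_autocorr_values))
print('Moran I (negative autocorrelation) =', morans_i(x, y, negative_autocorr_values))
print('Moran I (zero autocorrelation) =', morans_i(x, y, zero_autocorr_values))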

Figure 67: Variograms of datasets with positive, negative and zero spatial autocorrelation¶

In [ ]:
# Set random seed for reproducibility
np.random.seed(1)

# Generate random spatial coordinates
n_points = 100
x = np.random.uniform(0, 100, n_points)
y = np.random.uniform(0, 100, n_points)

# Function to calculate variogram
def calculate_variogram(x, y, values, max_lag, n_lags):
    # Calculate all pairwise distances
    distances = np.sqrt((x[:, None] - x[None, :])**2 + (y[:, None] - y[None, :])**2)
    # Calculate all pairwise squared differences
    sq_diffs = (values[:, None] - values[None, :])**2
    
    # Create lag bins
    lag_bins = np.linspace(0, max_lag, n_lags + 1)
    variogram = np.zeros(n_lags)
    lag_counts = np.zeros(n_lags)

    # Calculate the semi-variance for each lag bin
    for i in range(n_lags):
        # Find indices of pairs that fall into the current lag bin
        indices = np.where((distances >= lag_bins[i]) & (distances < lag_bins[i + 1]))
        # Calculate mean semi-variance for the current lag bin
        if len(indices[0]) > 0:
            variogram[i] = np.mean(sq_diffs[indices]) / 2
            lag_counts[i] = len(indices[0])

    # Midpoints of lag bins for plotting
    lag_bin_centers = (lag_bins[:-1] + lag_bins[1:]) / 2
    return lag_bin_centers, variogram

# Generate datasets with different types of spatial autocorrelation

# 1. Positive Spatial Autocorrelation
def generate_positive_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] += values[j] * np.exp(-distance / spatial_scale)
    return values

# 2. Negative Spatial Autocorrelation
def generate_negative_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] -= values[j] * np.exp(-distance / spatial_scale)
    return values

# 3. Zero Spatial Autocorrelation
def generate_zero_autocorrelation(n_points):
    values = np.random.uniform(-1, 1, n_points)
    return values

# Generate the datasets
positive_autocorr_values = generate_positive_autocorrelation(n_points)
negative_autocorr_values = generate_negative_autocorrelation(n_points)
zero_autocorr_values = generate_zero_autocorrelation(n_points)

# Calculate variograms
max_lag = 50
n_lags = 10
lag_centers_pos, variogram_pos = calculate_variogram(x, y, positive_autocorr_values, max_lag, n_lags)
lag_centers_neg, variogram_neg = calculate_variogram(x, y, negative_autocorr_values, max_lag, n_lags)
lag_centers_zero, variogram_zero = calculate_variogram(x, y, zero_autocorr_values, max_lag, n_lags)

# Plotting the variograms
plt.figure(figsize=(18, 6))

# Positive Spatial Autocorrelation Variogram
plt.subplot(1, 3, 1)
plt.plot(lag_centers_pos, variogram_pos, marker='o', linestyle='-', color='blue')
plt.title('Variogram (Positive Spatial Autocorrelation)')
plt.xlabel('Lag Distance')
plt.ylabel('Semi-Variance')

# Negative Spatial Autocorrelation Variogram
plt.subplot(1, 3, 2)
plt.plot(lag_centers_neg, variogram_neg, marker='o', linestyle='-', color='green')
plt.title('Variogram (Negative Spatial Autocorrelation)')
plt.xlabel('Lag Distance')
plt.ylabel('Semi-Variance')

# Zero Spatial Autocorrelation Variogram
plt.subplot(1, 3, 3)
plt.plot(lag_centers_zero, variogram_zero, marker='o', linestyle='-', color='red')
plt.title('Variogram (Zero Spatial Autocorrelation)')
plt.xlabel('Lag Distance')
plt.ylabel('Semi-Variance')

plt.tight_layout()
plt.show()
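
The quantity computed per lag bin above is the classical empirical semi-variance estimator,

$$\gamma(h) = \frac{1}{2\,N(h)} \sum_{(i,j) \in N(h)} \left(z_i - z_j\right)^2,$$

where $N(h)$ is the set (and count) of point pairs whose separation falls in the bin around lag $h$. Semi-variance that rises with lag before levelling off indicates positive spatial autocorrelation; a flat variogram indicates none.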

Figure 70: Example of applying kriging to a dataset with positive spatial autocorrelation¶

In [ ]:
# Set random seed for reproducibility
np.random.seed(1)

# Generate random spatial coordinates for a larger dataset
n_points = 250
x = np.random.uniform(0, 100, n_points)
y = np.random.uniform(0, 100, n_points)

# Function to generate positive spatial autocorrelation dataset
def generate_positive_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] += values[j] * np.exp(-distance / spatial_scale)
    return values

# Generate dataset with positive spatial autocorrelation
positive_autocorr_values = generate_positive_autocorrelation(n_points)

# Create a dense grid for kriging interpolation
grid_x, grid_y = np.meshgrid(np.linspace(0, 100, 200), np.linspace(0, 100, 200))

# Calculate pairwise distances between points
def calculate_distances(x, y):
    x_diff = x[:, np.newaxis] - x[np.newaxis, :]
    y_diff = y[:, np.newaxis] - y[np.newaxis, :]
    distances = np.sqrt(x_diff ** 2 + y_diff ** 2)
    return distances

distances = calculate_distances(x, y)

# Define a simple linear variogram model (nugget, sill and range are assumed here, not fitted)
def linear_variogram_model(distances, nugget=0, sill=1, range_=30):
    variogram = nugget + sill * (distances / range_)
    variogram[distances > range_] = sill
    return variogram

variogram_model = linear_variogram_model(distances)

# Set up the ordinary kriging system (note: uses the global x and y arrays; 'values' enters via the dot product below)
def kriging_weights(distances, variogram_model, values, grid_points):
    n = distances.shape[0]
    weights = np.zeros((grid_points.shape[0], n))
    
    for i, (gx, gy) in enumerate(grid_points):
        dists_to_point = np.sqrt((x - gx) ** 2 + (y - gy) ** 2)
        variogram_values = linear_variogram_model(dists_to_point)
        system_matrix = np.zeros((n + 1, n + 1))
        system_matrix[:n, :n] = variogram_model
        system_matrix[n, :-1] = 1
        system_matrix[:-1, n] = 1
        system_matrix[n, n] = 0
        
        rhs = np.append(variogram_values, 1)
        weights[i, :] = np.linalg.solve(system_matrix, rhs)[:-1]
    
    return weights

# Perform kriging interpolation
grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
weights = kriging_weights(distances, variogram_model, positive_autocorr_values, grid_points)
kriged_values = np.dot(weights, positive_autocorr_values)

# Reshape kriged values to the grid shape
kriged_values_grid = kriged_values.reshape(grid_x.shape)

# Plotting the original data and kriged result
plt.figure(figsize=(12, 6))

# Plot original data with positive spatial autocorrelation
plt.subplot(1, 2, 1)
plt.scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=20)
plt.title('Original Data\n(Positive Spatial Autocorrelation, n = 250)')
plt.xlabel('X')
plt.ylabel('Y')
plt.colorbar(label='Value')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.gca().set_aspect('equal', adjustable='box')  # Ensure equal scaling

# Plot the kriging interpolated map
plt.subplot(1, 2, 2)
plt.imshow(kriged_values_grid, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm')
plt.scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=50)
plt.title('Kriging Interpolation')
plt.xlabel('X')
plt.ylabel('Y')
plt.colorbar(label='Interpolated Value')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.gca().set_aspect('equal', adjustable='box')  # Ensure equal scaling

plt.tight_layout()
plt.show()
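
Building the kriging system by hand, as above, makes the weights explicit but is slow for dense grids. Dedicated packages wrap the same machinery; a sketch using PyKrige, which is not part of the course environment and would need to be installed separately:

In [ ]:
# Sketch using the PyKrige package (pip install pykrige); not used elsewhere in the course
from pykrige.ok import OrdinaryKriging

ok = OrdinaryKriging(x, y, positive_autocorr_values, variogram_model = 'linear')
z_kriged, z_variance = ok.execute('grid', np.linspace(0, 100, 200), np.linspace(0, 100, 200)) # Returns estimates and kriging variance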

Repeat of Figure 70 with an exponential variogram model¶

In [ ]:
# Set random seed for reproducibility
np.random.seed(1)

# Generate random spatial coordinates for a larger dataset
n_points = 250
x = np.random.uniform(0, 100, n_points)
y = np.random.uniform(0, 100, n_points)

# Function to generate positive spatial autocorrelation dataset
def generate_positive_autocorrelation(n_points, spatial_scale=10):
    values = np.zeros(n_points)
    for i in range(n_points):
        values[i] = np.random.normal(loc=0, scale=1)
        for j in range(i):
            distance = np.sqrt((x[i] - x[j])**2 + (y[i] - y[j])**2)
            if distance < spatial_scale:
                values[i] += values[j] * np.exp(-distance / spatial_scale)
    return values

# Generate dataset with positive spatial autocorrelation
positive_autocorr_values = generate_positive_autocorrelation(n_points)

# Create a dense grid for kriging interpolation
grid_x, grid_y = np.meshgrid(np.linspace(0, 100, 200), np.linspace(0, 100, 200))

# Calculate pairwise distances between points
def calculate_distances(x, y):
    x_diff = x[:, np.newaxis] - x[np.newaxis, :]
    y_diff = y[:, np.newaxis] - y[np.newaxis, :]
    distances = np.sqrt(x_diff ** 2 + y_diff ** 2)
    return distances

distances = calculate_distances(x, y)

# Define an exponential variogram model (nugget, sill and range are assumed here, not fitted)
def exponential_variogram_model(distances, nugget=0, sill=1, range_=30):
    variogram = nugget + (sill - nugget) * (1 - np.exp(-distances / range_))
    return variogram

variogram_model = exponential_variogram_model(distances)

# Set up the kriging system
def kriging_weights(distances, variogram_model, values, grid_points):
    n = distances.shape[0]
    weights = np.zeros((grid_points.shape[0], n))
    
    for i, (gx, gy) in enumerate(grid_points):
        dists_to_point = np.sqrt((x - gx) ** 2 + (y - gy) ** 2)
        variogram_values = exponential_variogram_model(dists_to_point)
        system_matrix = np.zeros((n + 1, n + 1))
        system_matrix[:n, :n] = variogram_model
        system_matrix[n, :-1] = 1
        system_matrix[:-1, n] = 1
        system_matrix[n, n] = 0
        
        rhs = np.append(variogram_values, 1)
        weights[i, :] = np.linalg.solve(system_matrix, rhs)[:-1]
    
    return weights

# Perform kriging interpolation
grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
weights = kriging_weights(distances, variogram_model, positive_autocorr_values, grid_points)
kriged_values = np.dot(weights, positive_autocorr_values)

# Reshape kriged values to the grid shape
kriged_values_grid = kriged_values.reshape(grid_x.shape)

# Set up figure size and layout
plt.figure(figsize=(14, 7))

# Plot original data with positive spatial autocorrelation
plt.subplot(1, 2, 1)
plt.scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=50)
plt.title('Original Data\n(Positive Spatial Autocorrelation, n = 250)')
plt.xlabel('X')
plt.ylabel('Y')
plt.colorbar(label='Value')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.gca().set_aspect('equal', adjustable='box')  # Ensure equal scaling

# Plot the kriging interpolated map with original points overlaid
plt.subplot(1, 2, 2)
plt.imshow(kriged_values_grid, extent=(0, 100, 0, 100), origin='lower', cmap='coolwarm', aspect='equal')
plt.colorbar(label='Interpolated Value')
plt.scatter(x, y, c=positive_autocorr_values, cmap='coolwarm', edgecolor='black', s=50)
plt.title('Kriging Interpolation (Exponential Variogram Model)')
plt.xlabel('X')
plt.ylabel('Y')
plt.xlim(0, 100)
plt.ylim(0, 100)
plt.gca().set_aspect('equal', adjustable='box')  # Ensure equal scaling

plt.tight_layout()
plt.show()
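
For reference, the two variogram models used in this and the previous figure are

$$\gamma_{\mathrm{lin}}(h) = c_0 + c\,\frac{h}{a} \quad (h \le a,\ \gamma = c \text{ beyond the range}), \qquad \gamma_{\mathrm{exp}}(h) = c_0 + (c - c_0)\left(1 - e^{-h/a}\right),$$

with nugget $c_0 = 0$, sill $c = 1$ and range $a = 30$ in both cells. The exponential model approaches the sill smoothly rather than reaching it exactly at the range, which typically produces a smoother interpolated surface.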